#!/perl -w
# NAME: listregex.pl
# AIM: Read perl files, and list all 'regex' expressions found
# 8/29/2009 - geoff mclane - http://geoffair.net/mperl/
use strict;
use warnings;
use File::Basename;
unshift(@INC, 'C:/GTools/perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
   my @tmpsp = split(/\\/,$pgmname);
   $pgmname = $tmpsp[-1];
}
my $outfile = "temp.$pgmname.txt";
open_log($outfile);

# options
my $add_scalars = 1; # include regex like '=~ $COND'
my $add_quoted = 1;  # include regex lile "word"

my $in_folder = '.';
my @warnings = ();
my $total_lines = 0;
my $reglist = "reglist.txt";

# FLAGS
my $FG_NONE = 0;
my $FG_SCAL = 1;   # is a SCALAR entry
my $FG_QUOT = 1;  # is a QUOTED entry

# debug
my $dbg01 = 0; # show prt( "Returning success [$reg] at off=[$ind]\n" ) if ($dbg01);
my $dbg02 = 0; # show prt( "$fil:$lnn: [$nxline]\n" ) if ($dbg02);;
my $dbg03 = 0; # show prt( "reg=[$reg] off=[$off] nxl=[$nxline]\n" ) if ($dbg03);
my $dbg04 = 0; # show prt( "$fil:$lnn: got regex = [$rlist] from line [$line]\n" ) if ($show && $dbg04);
my $dbg05 = 0; # show prt( "Returning QUOTED success [$reg] at off=[$ind]\n" ) if ($dbg05);
my $dbg06 = 0; # show prt( "Returning SCALAR success [$reg] at off=[$ind]\n" ) if ($dbg06);

sub prtw($) {
	my ($txt) = shift;
	prt($txt);
	$txt =~ s/\n$//;
	push(@warnings,$txt);
}

sub show_warnings() {
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS ...\n" );
        foreach my $line (@warnings) {
            prt("$line\n" );
        }
    } else {
        prt("\nNo warnings issued.\n");
    }
	#my $s = get_dbg_str();
	#prt( "WARNING: DEBUG ON [$s]\n" ) if length($s);
   prt("\n");
}

sub scan_directory($) {
	my ($ind) = shift;
	my ($DIR, $typ, $cnt, $hm);
	my ($ocnt, $ccnt, $hcnt, $dcnt);
	$ocnt = 0; $ccnt = 0; $hcnt = 0; $dcnt = 0;
	my @arr = ();
   prt( "Scanning folder [$ind] for perl files...\n" );
	if (opendir $DIR, $ind) {
		my @fils = readdir($DIR);
		closedir $DIR;
      foreach my $fil (@fils) {
         next if (($fil eq '.')||($fil eq '..'));
			my $ff = $ind;
         $ff .= "\\" if !($ff =~ /(\\|\/)$/ );
         $ff .= $fil;
         my ($nm, $dir, $ext) = fileparse( $fil, qr/\.[^.]*/ );
			my $lcext = lc($ext);
         if ( (($lcext eq '.pl')||($lcext eq '.pm')||($lcext eq '.cgi')) ) {
            push(@arr, [ $fil, $ff ]);
         }
      }
   } else {
      prt( "ERROR: Unable to open directory...\n" );
   }
   return \@arr;
}

# [m]/PATTERN/[g][i][m][o][s][x]
# s/PATTERN/REPLACEMENT/[e][g][i][m][o][s][x]
# tr/SEARCHLIST/REPLACMENTLIST/[c][d][s]
my $m_mods = 'gimosx';
my $s_mods = 'egimosx';
my $t_mods = 'cds';
sub return_regex($$$$) {
   my ($txt,$roff,$rreg,$rflag) = @_;   # like substr($line,$off), \$off, \$reg )
   my ($ind,$len,$i,$c,$reg,$nc,$go,$pc,$mods,$ppc,$endc,$sub);
   $ind = index($txt,'=~');
   if ($ind >= 0) {
      $ind += 2;
      $txt = substr($txt,$ind);
      $len = length($txt);
      $reg = '';
      $go = 0;
      $endc = '/';
      $sub = 0;
      #prt( "Found '=~' at offset $ind - checking [$txt]\n" ) if ($dbg01);
      for ($i = 0; $i < $len; $i++) {
         $c = substr($txt,$i,1);
         $nc = (($i + 1) < $len) ? substr($txt,$i+1,1) : '';
         #prt( "Check char [$c]\n" );
         if ($c eq '/') {
            $reg = $c;
            $mods = $m_mods;
            $go = 1;
         } elsif (($c eq 'm')&&($nc eq '/')) {
            $reg = "$c$nc";
            $mods = $m_mods;
            $i++;
            $go = 1;
         } elsif (($c eq 'm')&&($nc eq '|')) {
            $reg = "$c$nc";
            $mods = $m_mods;
            $endc = $nc;
            $i++;
            $go = 1;
         } elsif (($c eq 's')&&($nc eq '/')) {
            $reg = "$c$nc";
            $mods = $s_mods;
            $sub = 1;
            $i++;
            $go = 1;
         } elsif (($c eq 't')&&($nc eq 'r')) {
            if (($i + 2) < $len) {
               $nc = (($i + 2) < $len) ? substr($txt,$i+2,1) : '';
               if ($nc eq '/') {
                  $reg = "tr/";
                  $i += 2;
                  $mods = $t_mods;
                  $sub = 1;
                  $go = 1;
               }
            }
         } elsif ($c eq '$') {
            # special case of a scalar
            if ($add_scalars) {
               $reg = $c;
               $i++;
               for (; $i < $len; $i++) {
                  $ppc = $pc;
                  $pc = $c;
                  $c = substr($txt,$i,1);
                  $nc = (($i + 1) < $len) ? substr($txt,$i+1,1) : '';
                  if ($c =~ /\w/) {
                     $reg .= $c;
                  } else {
                     last;
                  }
               }
               if (length($reg) > 1) {
                  $ind += $i;
                  $$rreg = $reg;
                  $$roff = $ind;
                  $$rflag |= $FG_SCAL;
                  prt( "Returning SCALAR success [$reg] at off=[$ind]\n" ) if ($dbg06);
                  return 1;
               }
            }
         } elsif ($c eq '"') {
            if ($add_quoted) {
               # limited to '"\w+"'
               $reg = $c;
               $i++;
               for (; $i < $len; $i++) {
                  $ppc = $pc;
                  $pc = $c;
                  $c = substr($txt,$i,1);
                  $nc = (($i + 1) < $len) ? substr($txt,$i+1,1) : '';
                  if ($c =~ /\w/) {
                     $reg .= $c;
                  } elsif ($c eq '"') {
                     $reg .= $c;
                     $i++;
                     last;
                  } else {
                     $reg = '';
                     last;
                  }
               }
               if (length($reg) > 1) {
                  $ind += $i;
                  $$rreg = $reg;
                  $$roff = $ind;
                  $$rflag |= $FG_QUOT;
                  prt( "Returning QUOTED success [$reg] at off=[$ind]\n" ) if ($dbg05);
                  return 1;
               }
            }
         }
         if ($go) {
            #prt( "Found GO at offset $i\n" ) if ($dbg01);
            $i++;
            $c = '/';
            for (; $i < $len; $i++) {
               $ppc = $pc;
               $pc = $c;
               $c = substr($txt,$i,1);
               $nc = (($i + 1) < $len) ? substr($txt,$i+1,1) : '';
               $reg .= $c;
               # if (($c eq '/')&&($pc ne "\\")) {
               if ( ($c eq $endc) && 
                  (($pc ne "\\") || (($pc eq "\\") && ($ppc eq "\\")) ) ) {
                  # we appear to have it...
                  if ($sub) {
                     $sub--;
                     next;    # continue for next part of tr or s/.../.../
                  }
                  $i++; # now check for MODS
                  for (; $i < $len; $i++) {
                     $c = substr($txt,$i,1);
                     last if ( !($c =~ /\w/) );
                     if ($c =~ /[$mods]/) {
                        $reg .= $c;
                     } else {
                        last;
                     }
                  }
                  $ind += $i;
                  $$rreg = $reg;
                  $$roff = $ind;
                  prt( "Returning success [$reg] at off=[$ind]\n" ) if ($dbg01);
                  return 1;
               }
            }
            last;
         }
      }
   }
   #prt( "Returning FAILED\n" );
   return 0;
}


sub process_perl_files($) {
   my ($ra) = shift;
   my ($cnt,$ff,$fil,$i,$lnn,@lns,$line,$off,$reg,$rlist,$show,$nxline,$flag);
   $cnt = scalar @{$ra};
   prt( "Processing $cnt perl files...\n" );
   my %hreg = ();
   my @flagged = ();
   for ($i = 0; $i < $cnt; $i++) {
      $fil = ${$ra}[$i][0];
      $ff = ${$ra}[$i][1];
      $lnn = 0;
      if (open INF, "<$ff") {
         @lns = <INF>;
         close INF;
         $cnt = scalar @lns;
         prt( "Scanning $cnt lines, from file [$fil]\n" );
         $total_lines += $cnt;
         foreach $line (@lns) {
            chomp $line;
            $lnn++;
            next if ($line =~ /^\s*#/);
            if ($line =~ /=~/) {
               $off = 0;
               $reg = '';
               $rlist = '';
               $show = 0;
               $nxline = trim_all($line);
               prt( "$fil:$lnn: [$nxline]\n" ) if ($dbg02);
               $flag = 0;
               while (return_regex( $nxline, \$off, \$reg, \$flag ) ) {
                  if (defined $hreg{$reg}) {
                     $hreg{$reg} .= "|$fil";
                  } else {
                     $hreg{$reg} = "$fil";
                     $show = 1;
                  }
                  $rlist .= " + " if length($rlist);
                  $rlist .= $reg;
                  push(@flagged, [$i, $flag]) if ($flag);
                  $flag = 0;
                  $nxline = substr($nxline,$off);
                  prt( "reg=[$reg] off=[$off] nxl=[$nxline]\n" ) if ($dbg03);
               }
               $line = trim_all($line);
               if (length($rlist)) {
                  prt( "$fil:$lnn: got regex = [$rlist] from line [$line]\n" ) if ($show && $dbg04);
               } else {
                  prtw( "$fil:$lnn: WARNING: FAILED to get regex from line [$line]???\n" );
               }
            }
         }
      } else {
         prtw( "WARNING: Can not open [$ff]!\n" );
      }
   }
   return \%hreg;
}

sub show_regex_list($) {
   my ($rh) = shift;
   my ($k,$v,$cnt,$len,$max,$msg,$out);
   $max = 0;
   $cnt = 0;
   $out = '';
   foreach $k (keys %{$rh}) {
      $v = ${$rh}{$k};
      $len = length($k);
      $max = $len if ($len > $max);
      $cnt++;
   }
   $msg = "List of $cnt regex examples...\n";
   prt($msg);
   $out = $msg;
   foreach $k (sort keys %{$rh}) {
      $v = ${$rh}{$k};
      $msg = "$k";
      $msg .= ' ' while (length($msg) < $max);
      prt( "$msg $v\n" );
      $out .= "$msg $v\n";
   }
   $msg = "Done list of $cnt regex...\n";
   prt($msg);
   $out .= $msg;
   write2file($out,$reglist);
   prt( "Written list to $reglist file...\n" );
}

prt( "$0 ... List regex in perl files, in [$in_folder] ...\n" );
my $rarr = scan_directory($in_folder);
my $rrhash = process_perl_files($rarr);
prt( "Processed ".scalar @{$rarr}." files, $total_lines total lines, for ".scalar keys(%{$rrhash})." regex samples...\n" );
show_regex_list($rrhash);
show_warnings();
close_log($outfile,1);
exit(0);

# eof
