#!/perl -w
# NAME: showhrefs.pl
# AIM: Given a HTML file, extract, and show HREF (anchor) entries.
# 14/03/2010 - Lots of IMPROVEMENTS, including batch running... ie parse_arg(list of files)
# 2010/03/22 - strip '#down' off file name before checking
# 20100312 - Some tidying...
# 29/07/2007 - geoff mclane - http://geoffair.net/mperl/index.htm
use strict;
use warnings;
use File::Basename;
unshift(@INC, 'C:/GTools/perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";

# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
	my @tmpsp = split(/(\\|\/)/,$pgmname);
	$pgmname = $tmpsp[-1];
}
my $perl_root = 'C:\GTools\perl';
my $outfile = $perl_root."\\temp.$pgmname.txt";
open_log($outfile);

# options
my $g_base_dir = "C:\\HOMEPAGE\\GA\\";
my @in_files = qw ( mperl\src\index.htm );
#my @in_files = qw ( climate\climate-01.htm );
#my @in_files = qw( misc/flags/index.htm );
#my @in_files = qw( home2.htm );
#my @in_files = qw ( cgi\index.htm );
#my @in_files = qw ( travel\maroc\maroc-slide4.htm );
##my $g_base_dir = "C:\\HOMEPAGE\\HOM\\test4\\";
##my @in_files = qw(collections.htm limited-edition.htm groom-center.htm product-lines.htm);
##my $g_in_file = 'collections.htm';
##my $g_in_file = 'C:\HOMEPAGE\HOM\test4\limited-edition.htm';
##my $g_in_file = 'C:\HOMEPAGE\HOM\test4\groom-center.htm';
##my $g_in_file = 'C:\HOMEPAGE\HOM\test4\product-lines.htm';

my $load_log = 1;
my $min_href = 45;
my $def_out_file = $perl_root."\\tempnohrefs.htm";
my $remove_hrefs = 0;

# debug
my $dbg38 = 0; #prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38);

# global program variables

my $g_in_file = '';
my @hrefs = ();
my @anchors = ();
my $g_title = '';
my $g_hcnt = 0;
my $g_acnt = 0;
#my $hrf = '';
#my $hfile = '';
my $g_dirname = $g_base_dir;
#my $had_menu = 0;
my $g_filename = '';
my @html_ext = qw( .htm .html .shtml .php );

sub pgm_exit($$) {
   my ($val,$msg) = @_;
   if (length($msg)) {
      $msg .= "\n" if !($msg =~ /\n$/);
      prt($msg);
   }
   #show_warnings($val);
   close_log($outfile,$load_log);
   exit($val);
}

sub get_file_title($) {
	my ($inf) = shift;
	my ($IN);
	my $tit = '';
	if (open $IN, "<$inf") {
		my @lines = <$IN>;
		close $IN;
		###my $lc = scalar @lines;
		###prt( "Processing $lc lines from $inf ...\n" );
		$tit = return_tag( join( '', @lines ), 'title' );
		$tit =~ s/\n/ /gm;
		$tit = trim_all($tit);
	} else {
		###prt( "ERROR: Failed to open $inf ... $! ...\n" );
		$tit = "<open failed on $inf>";
	}
	return $tit;
}

sub process_file($) {
	my ($inf) = shift;
	if (open INF, "<$inf") {
		my @lines = <INF>;
		close INF;
		my $lc = scalar @lines;
		prt( "\nProcessing $lc lines from $inf ...\n" );
		my $ft = join( '', @lines );
		$g_title = return_tag( $ft, 'title' );
		$g_title =~ s/\n/ /gm;
		$g_title = trim_all($g_title);
		my $ntxt = remove_script( $ft );
      $ntxt = dropcomments($ntxt);
      # write2file($ntxt,"tempnew.txt");
      $ntxt = trimblanklines($ntxt);
		@hrefs = ret_hrefs_array( $ntxt );
		@anchors = ret_anchor_array( $ntxt );
      if ($remove_hrefs) {
         $ntxt = collecthrefs($ntxt,1);
         write2file($ntxt,$def_out_file);
         prt("Written $def_out_file file, without anchors...\n");
      }
	} else {
		prt( "ERROR: Failed to open [$inf] ... $! ...\n" );
		return 0;
	}
	return 1;
}

sub anchor_href($) {
	my ($txt) = shift;
	my $len = length($txt);
	my $ch = '';
	my $pch = ' ';
	my $tag = '';
	for (my $i = 0; $i < $len; $i++) {
		$ch = substr($txt, $i, 1);
		if ((lc($ch) eq 'h')&&($pch =~ /\s/)) {
			$pch = substr($txt, $i);
			if ($pch =~ /^href=/i) {
				$tag = substr($txt, ($i+5));
				$tag = trim_all($tag);
				if ( $tag =~ /^['"]/ ) {
					$pch = substr($tag,0,1);
					$tag = substr($tag,1);
				} else {
					$pch = ' ';
				}
				my $ind = index($tag, $pch);
				if ($ind != -1) {
					$tag = substr($tag,0,$ind);
				}
				return $tag;
			}
		}
		$pch = $ch;
	}
	return $tag;
}

#########################################################
# Passed an array of extensions,
# check if this is one of them?
#########################################################
sub file_has_my_ext($$) {
	my ($fil, $rexts) = @_;
	my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
   my $lcext = lc($ext);
	foreach my $ex (@{$rexts}) {
		return 1 if (lc($ex) eq $lcext);
	}
	return 0;
}

sub get_href_type($) {
	my ($src) = shift;
	if ($src =~ /^http:/i) {
		#push(@httprefs, [$src, $fil, $lnnos] );
		return 1; # remote HREF
	} elsif ($src =~ /^https:/i) {
		return 1; # remote HREF
		#push(@httpsrefs, [$src, $fil, $lnnos] );
	} elsif ($src =~ /^ftp:/i) {
		#push(@ftprefs, [$src, $fil, $lnnos] );
		return 3; # remote HREF
	} elsif ($src =~ /^mailto:/i) {
		#push(@mtrefs, [$src, $fil, $lnnos] );
		return 4; # remote HREF
	} elsif ( $src =~ /^javascript:/i ) {
		return 5; # a JAVASCRIPT HREF
	} elsif ($src =~ /^file:/i) {
		return 5; # remote HREF
	} elsif ( substr($src,0,1) eq '#') {
		# local in page HREF
		return 6;
	} else {
		my $ind = index($src,'#');
		if ( $ind != -1 ) {
			$src = substr($src,0,$ind);
		}
		$ind = index($src,'?');
		if ( $ind != -1 ) {
			$src = substr($src,0,$ind);
		}
		$src =~ s/\/$//;
		if (length($src)) {
			return 7;
		}
	}
	return 0;
}

sub get_href_type_name($) {
	my ($src) = shift;
    my $typ = get_href_type($src);
	if ($typ == 1) { # ($src =~ /^http:/i)
		return "1: remote HREF (http)";
	} elsif ($typ == 2) { # ($src =~ /^https:/i)
		return "2: remote HREF (https)";
	} elsif ($typ == 3) { # ($src =~ /^ftp:/i)
		return "3: remote HREF (ftp)";
	} elsif ($typ == 4) { # ($src =~ /^mailto:/i) {
		return "4: remote HREF (mailto)";
	} elsif ($typ == 5) { #
        if ($src =~ /^javascript:/i ) {
            return "5: a JAVASCRIPT HREF";
    	} elsif ($src =~ /^file:/i) {
	    	return "5: a FILE HREF";
        }
        return "5: a ???? HREF CHECKME";
	} elsif ($typ == 6) { # ( substr($src,0,1) eq '#')
		return "6: infile link";   #  (".substr($src,1).")";
	} elsif ($typ == 7) {
		return "7: local link";
	}
	return "0: UNCASED [$src] CHECKME!";
}

sub mycmp_decend0 {
	my $off = 0;
	return 1 if (${$a}[$off] < ${$b}[$off]);
	return -1 if (${$a}[$off] > ${$b}[$off]);
	return 0;
}

sub get_anchor_hash_ref_lc($$$) {
   my ($fank,$fil,$dbg) = @_;
   my %hash = ();
   my ($ank,$len,$i,$ch,$pc,$hr2,$txt);
   my ($lchr2);
   if ($fank =~ /<a\s+(.+)>$/) {
      $ank = trim_all($1);
      $len = length($ank);
      $ch = '';
      $hr2 = '';
      for ($i = 0; $i < $len; $i++) {
         $pc = $ch;
         $ch = substr($ank,$i,1);
         if ($ch =~ /\w/) {
            $hr2 .= $ch;   # accumulate \w chars - alphanumeric, including _
         } elsif (length($hr2)) {
            if (($ch ne '=') && ($ch =~ /\s/)) {
               $i++;
               for (; $i < $len; $i++) {
                  $ch = substr($ank,$i,1);
                  last if ($ch eq '=');
                  last if !($ch =~ /\s/);
               }
            }
            if ($ch eq '=') {
               # found our equal sign
               $i++; # move on...
               for (; $i < $len; $i++) {
                  $ch = substr($ank,$i,1);
                  last if ($ch =~ /('|")/);
                  last if !($ch =~ /\s/);
               }
               if (($ch eq '"')||($ch eq "'")) {
                  $pc = $ch;
                  $i++; # move on...
                  $txt = '';
                  for (; $i < $len; $i++) {
                     $ch = substr($ank,$i,1);
                     last if ($ch eq $pc);
                     $txt .= $ch;
                  }
                  if ($ch eq $pc) {
                     $lchr2 = lc($hr2);
                     $hash{$lchr2} = $txt;
                     prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38);
                  } else {
                     prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n");
                     pgm_exit(1,"") if ($dbg);
                  }
               } else {
                  if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) {
                     # accept these WITHOUT inverted comma
                     $txt = $ch;
                     $i++; # MOVING ON
                     for (; $i < $len; $i++) {
                        $ch = substr($ank,$i,1);
                        last if !($ch =~ /\w/);
                        $txt .= $ch;
                     }
                     $lchr2 = lc($hr2);
                     $hash{$lchr2} = $txt;
                     prt( "Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" );
                  } else {
                     prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n");
                     pgm_exit(1,"") if ($dbg);
                  }
               }
            } else {
               prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n");
               pgm_exit(1,"") if ($dbg);
            }
            $hr2 = '';
         }
      }
   }
   return \%hash;
}

sub process_in_files($) {
   my ($ra) = @_; # (\@in_files);
   my ($hrtyp,$hrt,$i,@slist,$ff,$msg,$tmp,@arr);
   my ($j,$ank,@arr2,$file,$rhrf,$hrf,$hfile);
   my ($nm,$tst,$fnd,$fullanc);
   foreach $hfile (@{$ra}) {
      $g_in_file = "$g_base_dir$hfile";
      ($g_filename, $g_dirname) = fileparse($g_in_file);
      next if !process_file( $g_in_file );
      $g_hcnt = scalar @hrefs;
      $g_acnt = scalar @anchors;
      prt( "Got $g_hcnt HREF... $g_acnt anchors... title=\"$g_title\"\n" );
      @arr2 = ();
      for ($i = 0; $i < $g_acnt; $i++) {
         $hrf = $anchors[$i];
         $hrf =~ s/\n/ /gm;
         $hrf = trim_all($hrf);
         $file = anchor_href($hrf);   # IF ANY
         $hrt = get_href_type($hfile);
         $rhrf = get_anchor_hash_ref_lc($hrf,$file,1);
         #            0     1     2      3
         push(@arr2, [$hrt, $hrf, $file, $rhrf]);
      }
      @arr = ();
      for ($i = 0; $i < $g_hcnt; $i++) {
         $fnd = -1;
         $hrf = $hrefs[$i];
         $hrf =~ s/\n/ /gm;
         $hrf = trim_all($hrf);
         $hrt = get_href_type($hrf);
         $hrtyp = get_href_type_name($hrf);
         # try to find ANCHOR
         for ($j = 0; $j < $g_acnt; $j++) {
            $ank = $arr2[$j];
            if (length(${$ank}[2])) {
               # a HREF, does it have a 'href'
               $rhrf = ${$ank}[3]; # extract HASH
               $nm = ${$rhrf}{'href'}; # extract value
               if ($hrf eq $nm) {
                  $fnd = $j;
                  #              0     1     2      3
                  # push(@arr2, [$hrt, $hrf, $file, $rhrf]);
                  $fullanc = ${$ank}[1];
                  last;
               }
            }
         }
         if ($hrt == 6) {
            # infile link - find the anchor
            $tmp = "NF";
            $tst = $hrf;
            $tst =~ s/^#//;
            for ($j = 0; $j < $g_acnt; $j++) {
               $ank = $arr2[$j];
               if (length(${$ank}[2]) == 0) {
                  # not a HREF, does it have a 'name'
                  $rhrf = ${$ank}[3]; # extract HASH
                  if (defined ${$rhrf}{'name'}) {
                     $nm = ${$rhrf}{'name'}; # extract value
                     if ($tst eq $nm) {
                        $tmp = "ok";
                        last;
                     }
                  }
               }
            }
            $hrtyp .= " $tmp ($fnd)";
         } elsif ($hrt == 7) {
            # local file link
            my $fn = $hrf;
            my $ind = index($fn,'#');
            if ($ind > 0) {
                $fn = substr($fn,0,$ind);
            }
            $ff = "$g_dirname$fn";
            if (-f $ff) {
               $hrtyp .= " ok ($fnd)" ;
            } else {
               $tmp = $ff.".txt";
               if (-f $tmp) {
                  $hrtyp .= " ok (with .txt) ($fnd)";
               } else {
                  $hrtyp .= " NF [$ff] ($fnd)";
                  if ($fnd != -1) {
                     $hrtyp .= "[$fullanc]";
                  }
               }
            }
         }
         #           0     1     2       3
         push(@arr, [$hrt, $hrf, $hrtyp, $fnd]);
      }

      @slist = sort mycmp_decend0 @arr;
      prt( "List $g_hcnt HREF (sorted by type)\n" );
      for ($i = 0; $i < $g_hcnt; $i++) {
         $hrf = $slist[$i][1];
         $hrtyp = $slist[$i][2];
         $hrf .= ' ' while (length($hrf) < $min_href);
         prt( "$hrf [$hrtyp]\n" );
      }

      #prt( "\nList $g_acnt anchors \n" );
      #$had_menu = 0;
      #foreach $hrf (@anchors) {
         #$hrf =~ s/\n/ /gm;
         #$hrf = trim_all($hrf);
         #$hfile = anchor_href($hrf);
         #if (file_has_my_ext("$g_dirname$hfile", \@html_ext)) {
            #$g_title = get_file_title( "$g_dirname$hfile" );
         #} else {
            #$g_title = "<not html>";
         #}
         ####prt( "$hrf ($hfile) $g_title\n" );
         #prt( "[$hrf] [$hfile], title=[$g_title]\n" ); # if ($had_menu);
         #if ($hfile eq './') {
            #$had_menu = 1;
         #}
      #}
   }
}

### MAIN PROCESS ###
parse_args(@ARGV);
process_in_files(\@in_files);
pgm_exit(0,"Normal exit");
########################
sub give_help {
   prt("$pgmname [options] file [file ...]\n");
   prt("The input file name(s) will be processed, and a list of HREF found output.\n");
   prt("options:\n");
   prt("  -h or -?  = this help, and exit 0\n");
   prt("  -r        = remove HREFS, and write to file [$def_out_file]\n");
   prt("  -o <out>  = set the output file name. Implies -r.\n");
}

sub need_arg {
   my ($arg,@av) = @_;
   pgm_exit(1,"ERROR: Option [$arg] needs foloowing argument! Try -h for valid options.\n")
      if (!@av);
}

sub parse_args {
   my (@av) = @_;
   my $cnt = 0;
   my @arr = ();
   my ($sarg,$ch);
   while (@av) {
      $cnt++;
      my $arg = $av[0];
      if ($arg =~ /^-/) {
         # what options?
         $sarg = substr($arg,1);
         $sarg = substr($sarg,1) while ($sarg =~ /^-/);
         $ch = substr($sarg,0,1);
         if (($ch eq 'h')||($ch eq '?')) {
            give_help();
            exit(0);
         } elsif ($ch eq 'r') {
            prt("Set to remove HREFS, and write file.\n");
            $remove_hrefs = 1;
         } elsif ($ch eq 'o') {
            need_arg(@av);
            shift @av;
            $cnt++;
            $def_out_file = $av[0];
            prt("Set to remove HREFS, and write file $def_out_file.\n");
            $remove_hrefs = 1;
         } else {
            pgm_exit(1,"ERROR: Unknown option [$arg]! Try -h for valid options.\n");
         }
      } else {
         if (-f $arg) {
            push(@arr,$arg);
         } else {
            pgm_exit(1,"ERROR: Unable to locate [$arg] file! Argument # $cnt. Check name, location...\n");
         }
      }
      shift @av;
   }
   if (@arr) {
      prt("Set input to ");
      foreach my $f (@arr) {
         prt("$f ");
      }
      prt("\n");
      $g_base_dir = "";
      @in_files = @arr;
   }
}

# eof
