#!/perl -w
# NAME: chkintlinks.pl
# AIM: Given a HTML file, check for <a name="abc"...> and <a href="#abc"...>
# are consistent
# 13/11/2008 - geoff mclane - http://geoffair.net/mperl
use strict;
use warnings;
use File::Basename;
##require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'fgutils.pl' or die "Unable to load fgutils.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
    my @tmpsp = split(/\\/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = "temp.$pgmname.txt";
open_log($outfile);

my $in_file = "C:\\HOMEPAGE\\GA\\mperl\\perl_hash.htm";
###my $in_file = "C:\\HOMEPAGE\\GA\\mperl\\perl_ref.htm";
###my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgfs-045.htm";
###my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgall.htm";
##my $in_file = "C:\\HOMEPAGE\\GA\\fg";
##my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgfs-045b.htm";
##my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgfs-044.htm";
my $resursive = 1;

my @warnings = ();
my $done_files = 0;
my $loadlog = 0;
prt( "$0 ... Hello, processing $in_file ...\n" );

if (-f $in_file) {
    my %h = process_file($in_file);
} elsif (-d $in_file) {
    process_directory($in_file,$resursive);
} else {
    prt("WARNING: $in_file is NOT file or folder ... check name, location!\n");
}

if (($done_files > 2)||(scalar @warnings > 10)) {
    show_warnings(0);
    $loadlog = 1;
}
close_log($outfile,$loadlog);
exit(0);

######################################
###### SUBS ONLY ######

sub is_my_file_type {
    my ($fil) = shift;
    if ($fil =~ /\.htm$/i) {
        return 1;
    } elsif ($fil =~ /\.html$/i) {
        return 1;
    }
    return 0;
}

sub html_to_lines {
    my ($rlm, @lns) = @_;
    my $intag = 0;
    my $text = '';  # gather TEXT between tags
    my @nlines = ();
    my ($fln, $ln, $ch, $pch, $nch, $len, $i, $i2, $tag, $xml, $dnx);
    my ($lnnm, $lnb, $nlnm);
    my ($ppch, $incomm);
    my $show_comm_dbg = 0;
    $pch = '';
    $ppch = '';
    $nch = '';
    $tag = '';
    $xml = '';
    $dnx = 0;
    $lnnm = 0;
    $nlnm = 0;
    $lnb = 0;
    $incomm = 0;
    $text = ''; # start NO TEXT
    foreach $fln (@lns) {
        chomp $fln;
        $ln = trim_all($fln);
        $len = length($ln);
        $lnnm++;    # count another xml line
        for ($i = 0; $i < $len; $i++) {
            $i2 = $i + 1;
            $ch = substr($ln,$i,1);
            $nch = (($i2 < $len) ? substr($ln,$i2,1) : ' ');
            if ($intag) {
                # on first GREATER THAN - SPACE
                $tag .= $ch;
                if ($ch eq '>') {
                    if ( $incomm ) {
                        prt("$lnnm: potential end of XML tag pch=$pch ppch=$ppch\n") if ($show_comm_dbg);
                        if (($pch eq '-') && ($ppch eq '-')) {
                            $nlnm++;
                            push(@nlines,$tag);
                            ### prt( "push(\@xlnmap, [ $nlnm, $lnb, $lnnm ]); # each NEW line has BEGIN and END\n" );
                            $$rlm{$nlnm} = "$lnb-$lnnm";    # each NEW line has BEGIN and END
                            $tag = '';
                            $intag = 0;
                            $xml = '';
                            $incomm = 0;
                            prt( "$lnnm: Exit comment [$ln]\n" ) if ($show_comm_dbg);
                        }
                    } else {
                        $nlnm++;
                        push(@nlines,$tag);
                        ### prt( "push(\@xlnmap, [ $nlnm, $lnb, $lnnm ]); # each NEW line has BEGIN and END\n" );
                        $$rlm{$nlnm} = "$lnb-$lnnm";    # each NEW line has BEGIN and END
                        $tag = '';
                        $intag = 0;
                        $xml = '';
                        $incomm = 0;
                    }
                }
            } else {
                if ($ch eq '<') {
                    if (length($text)) {
                        $nlnm++;
                        push(@nlines,$text);
                        $$rlm{$nlnm} = "$lnb-$lnnm";    # each NEW line has BEGIN and END
                        $text = '';
                    }
                    $tag = $ch; # start a tag line
                    $intag = 1; # signal in a tag
                    $xml = '';
                    $dnx = 0;
                    $lnb = $lnnm;    # set the BEGIN xml line
                    if ($nch eq '!') {
                        # but watch out for <!DOCTYPE ...>
                        if ($ln =~ /<!--/) {
                            prt( "$lnnm: Entering comment [$ln]\n" ) if ($show_comm_dbg);
                            $incomm = 1;
                        }
                    }
                } else {
                    $text .= $ch;
                }
            }
            $ppch = $pch;
            $pch = $ch;
        }
        # done a line - this is like a SPACE
        if ($intag && length($tag)) {
            $tag .= ' ' if !($tag =~ /(=|\s)$/);
        }
    }
    prtw("WARNING: Exit STILL in comment!\n") if ($incomm);
    if (length($tag)) {
        prtw("WARNING: xml re-lining error! Left pending tag [$tag]\nin $in_file file ...\n");
    }
    return @nlines;
}


sub process_file {
    my ($fil) = shift;
	my ($in_name,$in_dir) = fileparse($fil);
    my (@attribs, %atthash, %lnmap);
    my (@lines, $xlncnt, $lnnum, $line, $tag, $xln);
    my @names = ();
    my @hrefs = ();
    my @lhrefs = ();
    my %names_hash = ();
    my %hrefs_hash = ();
    my %lhrefs_hash = ();
    my ($name, $href, $nmcnt, $hrcnt, $lrcnt, $msg);
    my $dbg_name = 0;
    my $dbg_href = 0;
    my %hash = ();
    if (open INF, "<$fil") {
        %lnmap = ();
        @lines = <INF>;
        close INF;
        $xlncnt = scalar @lines;
        @lines = html_to_lines(\%lnmap, @lines);
        write2file(join("\n",@lines),'tempxml.txt');
        $lnnum = scalar @lines;
        $msg = "Processing $lnnum, from $xlncnt lines, from $fil ...\n";
        $lnnum = 0;
        prt( "$msg" );
        foreach $line (@lines) {
            chomp $line;
            $lnnum++;
            $xln = $lnmap{$lnnum};
            #prt( "line $xln: $line\n" );
            @attribs = space_split($line);
            $tag = $attribs[0];
            if ($tag && length($tag)) {
                %atthash = ();  # clear HASH - only if NOT a comment <!-- ... -->
                if ($tag =~ /^</) {
                    %atthash = array_2_hash_on_equals(@attribs) if !($tag =~ /^<!--/);
                }
                if ($tag =~ /<a/i) {    ###or if ($line =~ /^<a\s+(.+)>/i) {
                    if (defined $atthash{'name'}) {
                        $name = $atthash{'name'};
                        $name =~ s/>$//;
                        $name = strip_both_quotes($name);
                        if (is_in_array($name,@names)) {
                            prtw("WARNING: Name $name already in NAME array ($fil:$xln)...\n" );
                        } else {
                            push(@names,$name);
                            $msg = "$xln: Name = [$name]";
                            $names_hash{$name} = $msg;
                            prt( "$msg\n" ) if ($dbg_name);
                        }
                    }
                    if (defined $atthash{'href'}) {
                        $href = $atthash{'href'};
                        $href =~ s/>$//;
                        $href = strip_both_quotes($href);
                        if (is_in_array($href,@hrefs)) {
                            ### prtw("WARNING: HREF $href already in HREF array ...\n" );
                        } else {
                            push(@hrefs,$href);
                            $msg = "$xln: HREF = [$href]";
                            $hrefs_hash{$href} = $msg;
                            prt( "$msg\n" ) if ($dbg_href);
                        }
                    }
                    #prt( "$line\n" );
                }
            }
        }
        $nmcnt = scalar @names;
        $hrcnt = scalar @hrefs;
        foreach $href (@hrefs) {
            if ($href =~ /^#.+/) {
                $name = $href;
                push(@lhrefs, substr($href,1));
                $lhrefs_hash{$href} = $hrefs_hash{$name};    # copy the info
            }
        }
        $lrcnt = scalar @lhrefs;
        prt( "Got $nmcnt names, and $hrcnt HREF. $lrcnt local, entries ...\n" );
        $hrcnt = 0;
        foreach $href (@lhrefs) {
            if ( !is_in_array($href,@names) ) {
                $name = '#'.$href;
                $msg = $lhrefs_hash{$name};
                prtw("WARNING: Got local HREF of [$href], but not NAME! ($msg)\n");
                $hrcnt++;
            }
        }
        if ($hrcnt) {
            prtw("WARNING: Got $hrcnt local ref with no NAME anchor in $in_name!\n");
        } elsif ($lrcnt) {
            prt("All $lrcnt local references point to a NAME.\n" );
        } else {
            ###prtw("WARNING: NO local references found.\n" );
        }
        $hrcnt = 0;
        foreach $name (@names) {
            if ( !is_in_array($name,@lhrefs)) {
                $hrcnt++;
            }
        }
        if ($hrcnt) {
            prtw("WARNING: Got $hrcnt ANCHOR names, with NO internal links in $in_name ...\n");
            foreach $name (@names) {
                if ( !is_in_array($name,@lhrefs)) {
                    $msg = $names_hash{$name};
                    prtw("NOTE: NO HREF for [$name]! ($msg)\n");
                }
            }
        }
    } else {
        prtw("WARNING: Unable to open $fil!\n" );
    }
    $hash{'NAMES'} = { %names_hash };
    $hash{'HREFS'} = { %hrefs_hash };
    $hash{'LOCAL'} = { %lhrefs_hash };
    $done_files++;
    return %hash;
}

sub process_directory {
    my ($inf, $rec) = shift;
	prt( "Processing $inf folder ...\n" );
	if ( opendir( DIR, $inf ) ) {
		my @files = readdir(DIR);
		closedir DIR;
		foreach my $fil (@files) {
			if (($fil eq ".")||($fil eq "..")) {
				next;
			}
			my $ff = $inf."\\".$fil;
			if ( -d $ff ) {
				process_directory( $ff ) if ($rec);
			} else {
                if (is_my_file_type($fil)) {
                    my %h = process_file( $ff );
                }
            }
        }
    }
}

sub prtw {
    my ($tx) = shift;
    $tx =~ s/\n$// if ($tx =~ /\n$/);
    prt("$tx\n");
    push(@warnings,$tx);
}

sub show_warnings {
    my ($dbg) = shift;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS ...\n" );
        foreach my $line (@warnings) {
            prt("$line\n" );
        }
        prt("\n");
    } elsif ($dbg) {
        prt("\nNo warnings issued.\n\n");
    }
}

# eof - chkintlinks.pl
