#!/perl -w
# NAME: chkhtml.pl
# AIM: Just parse HTML elements, and report any problem
# 2010/04/06  - geoff mclane - http://geoffair.net/mperl/
use strict;
use warnings;
use File::Basename;
use Cwd;
unshift(@INC, 'C:\GTools\perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
    my @tmpsp = split(/\\/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $perl_dir = 'C:\GTools\perl';
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $load_log = 0;
my $def_infile = 'U:\var\www\fg\www\Docs\getstart\getstart24.html';
#my $def_infile = 'C:\HOMEPAGE\FG\Docs\getstart\getstartch9.html';
my $in_file = '';

my @closed_tags = qw( meta link area base basefont br frame hr isindex param bgsound embed keygen );

# tags which do NOT need a closing, like </p>, tag
my @opt_tags = ( "body", "colgroup", "dd", "dt", "head", "html", "li", "optgroup", "option",
"p", "tbody", "td", "tfoot", "th", "thead", "tr", "marquee" );

### program variables
my $verbosity = 0;
my @warnings = ();
my $cwd = cwd();

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg)
    }
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}

sub show_warnings() {
   if (@warnings) {
      prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
      foreach my $itm (@warnings) {
         prt("$itm\n");
      }
      prt("\n");
   } else {
      prt( "\nNo warnings issued.\n\n" );
   }
}

sub is_closed_tag($) {
    my ($tt) = shift;
    my $lctt = lc($tt);
    foreach my $tag (@closed_tags) {
        return 1 if ($tag eq $lctt);
    }
    return 0;
}

sub is_opt_tag($) {
    my ($tt) = shift;
    my $lctt = lc($tt);
    foreach my $tag (@opt_tags) {
        return 1 if ($tag eq $lctt);
    }
    return 0;
}

# $drop = can_find_this_tag($tag,\@elements);
sub can_find_this_tag($$) {
    my ($tag,$re) = @_;
    my $len = scalar @{$re};
    my $drop = 0;
    my $bu = -1;
    my $last = '';
    while ($len) {
        $drop++;    # can pop this one
        $last = ${$re}[$bu][0]; # get tag
        if ($last eq $tag) {    # if the desired tag
            return $drop;   # return drop value
        } elsif ( ! is_opt_tag($last) ) {
            return 0;   # oop, have a non-optional tag
        }
        $bu--;  # back up one more
        $len--; # and reduce available to check
    }
    return 0;
}

sub is_all_optional($) {
    my ($re) = @_;
    my $len = scalar @{$re};
    my $bu = -1;
    my ($last);
    while ($len) {
        $last = ${$re}[$bu][0]; # get tag
        if ( ! is_opt_tag($last) ) {
            return 0;   # oop, have a non-optional tag
        }
        $bu--;  # back up one more
        $len--; # and reduce available to check
    }
    return 1;   # ALL were optiona
}

sub show_stack_elements($$$) {
    my ($tag,$rele,$rlns) = @_;
    my $cnt = scalar @{$rele};
    my $lcnt = scalar @{$rlns};
    if ($cnt) {
        prt("The stack has $cnt elements... The current closing element is [$tag]\n");
        for (my $i = 0; $i < $cnt; $i++) {
            my $e = ${$rele}[$i][0];
            my $n = ${$rele}[$i][1];
            prt("$n: elelement [$e]");
            prt(" SAME as tag [$tag]!") if ($e eq $tag);
            if ($n <= $lcnt) {
                my $ln = trim_all(${$rlns}[$n-1]);
                prt(" line=[$ln]");
            }
            prt("\n");
        }
    }
}

sub get_element_chain($) {
    my ($rele) = @_;
    my $cnt = scalar @{$rele};
    my $chn = '';
    if ($cnt) {
        for (my $i = 0; $i < $cnt; $i++) {
            my $e = ${$rele}[$i][0];
            $chn .= '|' if length($chn);
            $chn .= $e;
        }
    }
    return $chn;
}


sub process_file($) {
    my ($inf) = shift;
    if (!open INF, "<$inf") {
        pgm_exit(1,"ERROR: Unable to open file [$inf]!\n");
    }
    my @lines = <INF>;
    close INF;
    my $lncnt = scalar @lines;
    prt("Got $lncnt lines, from $inf...\n");
    my ($i,$line,$ch,$tag,$len,$intag,$txt,$j,$pc,$ppc,$incdata,$hadsp,$attrs);
    my ($lnn,$last,$lln,$bgnlnn,$endlnn,$clnn,$stkdep,$maxdep);
    my ($maxelement,$echn);
    $tag = '';
    $attrs = '';
    $intag = 0;
    $incdata = 0;
    $hadsp = 0;
    $txt = '';
    $ch = '';
    $pc = '';
    my @elements = ();
    $lnn = 0;
    $maxdep = 0;
    $maxelement = '';
    for ($i = 0; $i < $lncnt; $i++) {
        $line = $lines[$i];
        chomp $line;
        $len = length($line);
        $lnn++;
        $clnn = sprintf("%3d",$lnn);
        for ($j = 0; $j < $len; $j++) {
            $ppc = $pc;
            $pc = $ch;
            $ch = substr($line,$j,1);
            if ($incdata) {
                $tag .= $ch;
                if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']')) {
                    $incdata = 0;
                    prt("$clnn: End CDATA\n");
                }
            } elsif ($intag) {
                if ($hadsp) {
                    $attrs .= $ch if !($ch eq '>');
                } elsif ($ch =~ /\s/) {
                    $hadsp = 1;
                } else {
                    $tag .= $ch if !($ch eq '>');
                }

                if ($ch eq '>') {
                    $intag = 0;
                    $endlnn = $lnn;
                } elsif (($ch eq '[')&&($pc eq 'A')&&($tag =~ /^<\!\[CDATA\[/)) {
                    $incdata = 1;
                    prt("\n$clnn: Begin CDATA\n");
                }
                if (!$intag) {
                    $tag = trim_all($tag);
                    $clnn = sprintf("%3d",$lnn);
                    if ($verbosity) {
                        prt("$clnn: ");
                        prt("Text [".trim_all($txt)."] ") if (length($txt) && !($txt =~ /^\s+$/));
                        prt("End tag [$tag] ");
                        prt("Attrs [".trim_all($attrs)."] ") if (length($attrs));
                    }
                    if ($tag =~ /^(\!|\?)/) {
                        prt("Special") if ($verbosity);
                    } else {
                        # if ($attrs =~ /\/$/) but it may NOT end with '/'
                        if (($attrs =~ /\/$/) || is_closed_tag($tag)) {
                            prt("self-closed") if ($verbosity);
                        } elsif ($tag =~ /^\//) {
                            $tag = substr($tag,1);
                            prt("Close") if ($verbosity);
                            if (@elements) {
                                $last = $elements[-1][0]; 
                                $lln  = $elements[-1][1]; 
                                if ($last eq $tag) {
                                    pop @elements;
                                } else {
                                    # but may have 'opt' tags - tags that need no close on the stack, which
                                    # can be dropped to get to this tag
                                    my $drop = can_find_this_tag($tag,\@elements);
                                    if ($drop) {
                                        while($drop--) {
                                            pop @elements;
                                        }
                                    } else {
                                        prt("\nERROR: Last [$last]$lln NE [$tag]$lnn line=[".trim_all($line)."]\n");
                                        show_stack_elements($tag,\@elements,\@lines);
                                        pgm_exit(1,"ERROR:[1]: It is useless to continue when the element stack is out of order!\n");
                                    }
                                }
                            } else {
                                prt("\nERROR: The stack has NO elements... The current closing element is [$tag]\n");
                                pgm_exit(1,"ERROR:[2]: It is useless to continue when the element stack is out of order! [2]\n");
                            }
                        } else {
                            prt("Open") if ($verbosity);
                            push(@elements,[$tag,$bgnlnn,$endlnn]);
                            $echn = get_element_chain(\@elements);
                            $stkdep = scalar @elements;
                            if ($stkdep > $maxdep) {
                                $maxdep = $stkdep;
                                $maxelement = "$clnn: $tag $bgnlnn $endlnn [$echn]";
                            }
                        }
                    }
                    prt("\n") if ($verbosity);
                    # reset
                    $txt = '';
                    $tag = '';
                    $attrs = '';
                    $hadsp = 0;
                }
            } else {
                if ($ch eq '<') {
                    $tag = '';
                    $intag = 1;
                    $hadsp = 0;
                    $bgnlnn = $lnn;
                } else {
                    $txt .= $ch;
                }
            }
        } # reached end of line - get next
        $ch = ' ';
        $txt .= $ch if (length($txt) && !($txt =~ /\s$/));
        if ($hadsp) {
            $attrs .= $ch if (length($attrs) && !($attrs =~ /\s$/));
        } else {
            $tag .= $ch if (length($tag) && !($tag =~ /\s$/));
        }
        $ppc = $pc;
        $pc = $ch;
    }
    if (@elements && !is_all_optional(\@elements)) {
        show_stack_elements("At-end-of-file",\@elements,\@lines);
        pgm_exit(1,"ERROR:[3] It is useless to continue when the element stack is out of order!\n");

    }
    prt("Max. element stack $maxdep...$maxelement\n");
    prt("Done $lncnt lines... $inf appears ok...\n");
}

#########################################
### MAIN ###
parse_args(@ARGV);
prt( "$pgmname: in [$cwd]: Process $in_file...\n" );
process_file($in_file);
pgm_exit(0,"Normal exit(0)");
########################################

sub give_help {
    prt("$pgmname: version 0.0.1 2010-04-06\n");
    prt("Usage: $pgmname [options] in_file_name\n");
    prt("Options:\n");
    prt(" -h (or -?) = THis help, and exit 0\n");
    prt(" -l         = Load log file at end.\n");
    prt(" -v[num]    = Bump, or set verbosity to [num]\n");
    prt("Parse input file, and report any problems...\n");

}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg,$ch);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            $ch = substr($sarg,0,1);
            if ($ch =~ /h/i) {
                give_help();
                pgm_exit(0,"Help exit");
            } elsif ($ch =~ /l/i) {
                $load_log = 1;
                prt("Set to load log at end\n");
            } elsif ($ch =~ /v/i) {
                $sarg = substr($sarg,1);
                if (length($sarg)) {
                    if ($sarg =~ /^\d+$/) {
                        $verbosity = $sarg;
                        prt("Set verbosity to [$verbosity]\n");
                    } else {
                        pgm_exit(1,"Unknown argument [$arg] - verbosity is -v[num]. Try -h for help\n");
                    }
                } else {
                    $verbosity++;
                    prt("Bumped verbosity to [$verbosity]\n");
                }
            } else {
                pgm_exit(1,"Unknown argument [$arg]  Try -h for help\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input file to [$in_file]\n");
        }
        shift @av;
    }
    if (!length($in_file)) {
        $in_file = $def_infile;
        $load_log = 1;
        $verbosity = 9;
        prt("Set DEFAULT input file to [$in_file], and set load_log=1, and verbosity=$verbosity\n");
        
    }
}

# eof - template.pl
