#!/usr/bin/perl -w
# NAME: htmlclass.pl
# AIM: Get class list from html file
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
if ($os =~ /win/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
require 'lib_url.pl'  or die "Unable to load 'lib_url.pl' Check paths in \@INC...\n";
require 'lib_css.pl'  or die "Unable to load 'lib_css.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.1 2012-07-18";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_file = $temp_dir.$PATH_SEP."temphtm.html";
my $find = '';

# ### DEBUG ###
my $debug_on = 1;
my $def_file = 'C:\FG\17\crossfeed\test\list.html';
my $def_find = 'header_flights';
#my $def_find = "wrapper";

### program variables
my @warnings = ();
my $cwd = cwd();
my $strip_bom = 1;
my $curr_file_bom = '';

my @BOM_list = (
    [ "UTF-8",       3, [0xEF,0xBB,0xBF     ] ], # 239 187 191   
    [ "UTF-16 (BE)", 2, [0xFE,0xFF          ] ], # 254 255 
    [ "UTF-16 (LE)", 2, [0xFF,0xFE          ] ], # 255 254
    [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255
    [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0
    [ "UTF-7a"     , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7b"     , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7c"     , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7d"     , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-1"      , 3, [0xF7,0x64,0x4C     ] ], # 247 100 76 
    [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115
    [ "SCSU"       , 3, [0x0E,0xFE,0xFF     ] ], # 14 254 255
    [ "BOCU-1"     , 3, [0xFB,0xEE,0x28     ] ], # 251 238 40
    [ "GB-18030"   , 4, [0x84,0x31,0x95,0x33] ]  # 132 49 149 51
);

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
           prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}

# LOAD without a BOM
sub line_has_bom($$) {
    my ($line,$rname) = @_;
    my $max = scalar @BOM_list;
    my $len = length($line);
    my ($i,$j,$name,$cnt,$ra,$ch,$val);
    for ($i = 0; $i < $max; $i++) {
        $name = $BOM_list[$i][0]; # name
        $cnt  = $BOM_list[$i][1]; # length
        $ra   = $BOM_list[$i][2]; # ref array of values
        if ($len > $cnt) {  # make sure line length GT BOM
            for ($j = 0; $j < $cnt; $j++) {
                $ch = substr($line,$j,1);   # extract CHAR
                $val = ord($ch);            # get VALUE
                last if ($val != ${$ra}[$j]); # compare
            }
            if ($j == $cnt) {   # if ALL values found
                ${$rname} = $name;  # give back 'name'
                return $cnt;    # and return count
            }
        }
    }
    return 0;   # no BOM found
}

sub remove_utf_bom($$) {
    my ($ff,$ra) = @_;
    my $line = ${$ra}[0];  # get first line
    my $name = '';
    my $len = line_has_bom($line,\$name);
    if ($len) {
        $curr_file_bom = substr($line,0,$len);
        $line = substr($line,$len); # truncate line
        ${$ra}[0] = $line;  # and return minus BOM
        my ($nm,$dr) = fileparse($ff); # just show name
        prt("NOTE: File [$nm] is $name encoding. BOM($len) removed.\n");
    }
}

sub load_file_lines($$) {
    my ($ff,$ra) = @_;
    my $lncnt = 0;
    $curr_file_bom = '';
    if (open INF, "<$ff") {
        @{$ra} = <INF>;
        close INF;
        $lncnt = scalar @{$ra};
        remove_utf_bom($ff,$ra) if ($strip_bom);
    } else {
        prtw("WARNING: Unable to open [$ff]!\n");
    }
    return $lncnt;
}

sub show_hash($$) {
    my ($rh,$msg) = @_;
    my ($key,$val);
    my $cnt = 0;
    my $hdr = "$msg [";
    foreach $key (keys %{$rh}) {
        $val = ${$rh}{$key};
        prt("$hdr") if (length($hdr));
        if (length($val)) {
            prt("$key=$val ");
        } else {
            prt("$key ");
        }
        $cnt++;
        $hdr = '';
    }
    prt("]\n");
}

sub split_attrs($) {
    my $txt = shift;
    my @arr = space_split($txt);
    my ($cnt,$key,$val,$i);
    my %h = ();
    foreach $txt (@arr) {
        my @a = split('=',$txt);
        $cnt = scalar @a;
        if ($cnt > 2) {
            for ($i = 2; $i < $cnt; $i++) {
                $a[1] .= "=";
                $a[1] .= $a[$i];
            }
            $cnt = 2;
        }
        if ($cnt == 1) {
            $h{$txt} = '';
        } elsif ($cnt == 2) {
            $key = $a[0];
            $val = strip_both_quotes($a[1]);
            $h{$key} = $val;
        } else {
            pgm_exit(1,"UGH: cnt=$cnt NOT HANDLED [$txt]! FIX ME!\n");
        }
    }
    show_hash(\%h,'ATTR') if (VERB9());
    return \%h;
}


sub search_for_find($$) {
    my ($rh,$file) = @_;
    my ($key,$val,$k2,$v2);
    foreach $key (keys %{$rh}) {
        $val = ${$rh}{$key};
        if ($key =~ /$find/i) {
            prt("$file: Found key $key, with find [$find]\n");
            show_hash($val,'VAL');
        } else {
            foreach $k2 (keys %{$val}) {
                $v2 = ${$val}{$k2};
                if ($k2 =~ /$find/i) {
                    prt("$file: With key $key, and find [$find], fnd $k2 = $v2\n");
                } elsif ($v2 =~ /$find/i) {
                    prt("$file: With key $key, and find [$find], k2 $k2 = $v2 fnd\n");
                }
            }
        }
    }
}


## for file, read file lines with sub read_css_file($$$) { my ($rh,$file,$dbg) = @_;
## for URL, fetch, with sub fetch_url($$) { my ($url,$ra) = @_;
## and get ref hash = 
## return read_css_lines(\@lines,$rh,$file,$dbg);


sub process_in_file($) {
    my ($inf) = @_;
    my @lines = ();
    my $lncnt = load_file_lines($inf,\@lines);
    prt("Processing $lncnt lines, from [$inf]...\n");
    my ($name,$dir) = fileparse($inf);
    my ($line,$inc,$lnn,$i,$len,$ch,$pc,$tag,$txt,$attr,$msg,$tmp,$name,$cnt);
    my ($rh,$href,$ff,$tmp);
    ut_fix_directory(\$dir);
    my $inquot = 0;
    my $incomm = 0;
    my $intable = 0;
    my $intag = 0;
    my $hadsp = 0;
    my $inbody = 0;
    my $intr = 0;
    my $tdcnt = 0;
    my $trcnt = 0;
    my $inscript = 0;
    my $endsc = 0;
    my $http = '';
    my $lsttxt = '';
    my $ptxt = '';
    my $endtable = 0;
    my $eol = " ";  # or "\n"
    $lnn = 0;
    $ch = '';
    $tag = '';
    $txt = '';
    $attr = '';
    my @list = ();
    my @csslns = ();
    my ($rcss,$csslns);
    my $dbg = 1;
    foreach $line (@lines) {
        chomp $line;
        $lnn++;
        $line = trim_all($line);
        $len = length($line);
        next if ($len == 0);
        for ($i = 0; $i < $len; $i++) {
            $pc = $ch;
            $ch = substr($line,$i,1);
            if ($intag) {
                if ($ch eq '>') {
                    $msg = "<$tag";
                    $msg .= " $attr" if (length($attr));
                    $msg .= $ch;
                    $intag = 0;
                    if ($incomm) {
                        if ($attr =~ /--$/) {
                            $incomm = 0;
                            $endsc = 1;
                        }
                    } elsif ($inscript) {
                        if ($tag =~ /^\/script$/i) {
                            $inscript = 0;
                            $endsc = 1;
                        }
                    } elsif ($inbody) {
                        if ($tag =~ /^\/body$/i) {
                            $inbody = 0;
                        }
                    } elsif ($tag =~ /^body$/i) {
                        $inbody = 1;
                    }
                    if ($inscript || $incomm || $endsc) {
                        # no show
                    } else {
                        prt("$txt");
                        prt("$msg\n");
                        $rh = split_attrs($attr) if (length($attr));
                        if ($tag =~ /^link$/i) {
                            if (defined ${$rh}{'href'}) {
                                $href = ${$rh}{'href'};
                                 my %h = ();
                                 my ($n,$d);
                                if ($href =~ /^http:\/\//i) {
                                    $csslns = fetch_url($href,\@csslns,$dbg); # { my ($url,$ra) = @_;
                                    $tmp = $href;
                                    $tmp =~ s/^http:\/\///;
                                    ($n,$d) = fileparse($tmp);
                                    $rcss = read_css_lines(\@csslns,\%h,$n,$dbg);
                                } else {
                                    $ff = $dir.$href;
                                    ($n,$d) = fileparse($href);
                                    $rcss = read_css_file(\%h,$ff,$dbg);
                                }
                                search_for_find(\%h,$n);
                            }
                        }
                    }
                    $txt = '';
                    $tag = '';
                    $attr = '';
                    $endsc = 0;
                } elsif ($hadsp) {
                    $attr .= $ch;   # add to attributes
                } elsif ($ch =~ /\s/) {
                    $hadsp = 1;
                    if ($tag =~ /^script$/i) {
                        $inscript = 1;
                    }
                } else {
                    $tag .= $ch;
                    if (($ch eq '-')&&($tag =~ /^!--/)) {
                        #prt("$lnn: Begin comment - <$tag\n");
                        $incomm = 1;
                    }
                }
            } else {
                if ($ch eq '<') {
                    $intag = 1;
                    $hadsp = 0;
                    $attr = '';
                    $tag = '';
                } else {
                    $txt .= $ch;
                }
            }
        }
        # end of line
        if ($intag) {
            if ($hadsp) {
                ###$attr .= ' ' if (length($attr));
                $attr .= $eol if (length($attr));
            } else {
                $hadsp = 1;
            }
        } else {
            $txt .= $eol if (length($txt));
        }
    }
    #write2file($http,$out_file);
    #prt("New HTML wirtten to $out_file\n");
}

#########################################
### MAIN ###
parse_args(@ARGV);
process_in_file($in_file);
pgm_exit(0,"");
########################################

sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^f/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $find = $sarg;
                prt("Set FIND to [$find].\n") if (VERB1());
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                if ($sarg =~ /^ll/) {
                    $load_log = 2;
                } else {
                    $load_log = 1;
                }
                prt("Set to load log at end. ($load_log)\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_file = $sarg;
                prt("Set out file to [$out_file].\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if (VERB1());
        }
        shift @av;
    }

    if ($debug_on) {
        prtw("WARNING: DEBUG is ON\n");
        if (length($in_file) ==  0) {
            $in_file = $def_file;
            prt("Set DEFAULT input to [$in_file]\n");
        }
        if (length($find) == 0) {
            $find = $def_find;
        }
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
    if (length($find) == 0) {
        pgm_exit(1,"ERROR: No class/id to find found in command!\n");
    }
}

sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load        (-l) = Load LOG at end. ($outfile)\n");
    prt(" --find class  (-f) = Class or id to find.\n");
    prt(" --out <file>  (-o) = Write output to this file.\n");
}

# eof - template.pl
