#!/usr/bin/perl -w
# NAME: chkxml2.pl
# AIM: Given a directory, check ALL xml files
# (a) for a BOM, and (b) for xml character encoding
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use File::stat; # get file info if ($sb = stat($fil)){$dt = $sb->mtime; $sz = $sb->size;}
use Cwd;
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
if ($os =~ /win/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.1 2012-05-05";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_xml = '';

### debug ###
my $debug_on = 1;
#my $def_file = 'c:\FG\15\fgdata';
my $def_file = 'c:\FGCVS\flightgear\data';

### program variables
my @warnings = ();
my $cwd = cwd();
my @xml_array = ();
my $total_lines = 0;
my $total_files = 0;
my $blank_files = 0;
my $bom_files = 0;
my $files_no_decl = 0;
my @no_xml_decl = ();
my $files_with_enc = 0;
my $files_no_enc = 0;
my @blanks_founda = ();
my %encs_foundh = ();
my @encs_founda = ();
my %boms_foundh = ();
my @boms_founda = ();
my $curr_file_bom = '';
my %encs_found = ();
my $total_bytes = 0;
my $largest_file_size = 0;
my $largest_file_name = '';
my $largest_file_line = 0;

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

# BOM list - name, count, values
my @BOM_list = (
    [ "UTF-8",       3, [0xEF,0xBB,0xBF     ] ], # 239 187 191   
    [ "UTF-16 (BE)", 2, [0xFE,0xFF          ] ], # 254 255 
    [ "UTF-16 (LE)", 2, [0xFF,0xFE          ] ], # 255 254
    [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255
    [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0
    [ "UTF-7a"     , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7b"     , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7c"     , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7d"     , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-1"      , 3, [0xF7,0x64,0x4C     ] ], # 247 100 76 
    [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115
    [ "SCSU"       , 3, [0x0E,0xFE,0xFF     ] ], # 14 254 255
    [ "BOCU-1"     , 3, [0xFB,0xEE,0x28     ] ], # 251 238 40
    [ "GB-18030"   , 4, [0x84,0x31,0x95,0x33] ]  # 132 49 149 51
);

sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
           prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}

# LOAD without a BOM
my $last_bom_name = '';
sub line_has_bom($$) {
    my ($line,$rname) = @_;
    my $max = scalar @BOM_list;
    my $len = length($line);
    my ($i,$j,$name,$cnt,$ra,$ch,$val);
    for ($i = 0; $i < $max; $i++) {
        $name = $BOM_list[$i][0]; # name
        $cnt  = $BOM_list[$i][1]; # length
        $ra   = $BOM_list[$i][2]; # ref array of values
        if ($len > $cnt) {  # make sure line length GT BOM
            for ($j = 0; $j < $cnt; $j++) {
                $ch = substr($line,$j,1);   # extract CHAR
                $val = ord($ch);            # get VALUE
                last if ($val != ${$ra}[$j]); # compare
            }
            if ($j == $cnt) {   # if ALL values found
                ${$rname} = $name;  # give back 'name'
                $last_bom_name = $name;
                return $cnt;    # and return count
            }
        }
    }
    return 0;   # no BOM found
}

sub remove_utf_bom($$) {
    my ($ff,$ra) = @_;
    my $line = ${$ra}[0];  # get first line
    my $name = '';
    my $len = line_has_bom($line,\$name);
    my $iret = 0;
    if ($len) {
        $curr_file_bom = substr($line,0,$len);
        $line = substr($line,$len); # truncate line
        ${$ra}[0] = $line;  # and return minus BOM
        my ($nm,$dr) = fileparse($ff); # just show name
        prt("NOTE: File [$ff] is $name encoding. BOM($len) removed.\n");
        $iret = 1;
    }
    return $iret;
}

sub process_in_dir($);
sub process_in_dir($) {
    my $dir = shift;
    my @dirs = ();
    if (!opendir(DIR,$dir)) {
        prtw("WARNING: Unable to open directory [$dir}!\n");
        return;
    }
    my @files = readdir(DIR);
    closedir(DIR);
    my $cnt = scalar @files;
    prt("Have $cnt files, from [$dir] to process...\n") if (VERB9());
    my ($file,$ff);
    ut_fix_directory(\$dir);
    @dirs = ();
    foreach $file (@files) {
        next if (($file eq ".") || ($file eq ".."));
        $ff = $dir.$file;
        if ( -l $ff) {
            # ignore LINKS
        } elsif (-d $ff) {
            push(@dirs,$ff);
        } elsif (-f $ff) {
            if ($file =~ /\.xml$/i) {
                push(@xml_array,$ff);
            }
        }
    }
    foreach $dir (@dirs) {
        process_in_dir($dir);
    }
}

sub local_xml_to_lines($$$) {
    my ($rlm, $rlns,$file) = @_;
    my $intag = 0;
    my $text = '';
    my @nlines = ();
    my ($fln, $ln, $ch, $pch, $nch, $len, $i, $i2, $tag, $xml, $dnx);
    my ($lnnm, $lnb, $nlnm);
    $pch = '';
    $nch = '';
    $tag = '';
    $xml = '';
    $dnx = 0;
    $lnnm = 0;
    $nlnm = 0;
    $lnb = 0;
    my $nxtxt = '';
    foreach $fln (@{$rlns}) {
        chomp $fln;
        $ln = trim_all($fln);
        $len = length($ln);
        $lnnm++;    # count another xml line
        for ($i = 0; $i < $len; $i++) {
            $i2 = $i + 1;
            $ch = substr($ln,$i,1);
            $nch = (($i2 < $len) ? substr($ln,$i2,1) : ' ');
            if ($intag) {
                # on first SPACE
                $tag .= $ch;
                if ($ch eq '>') {
                    # end of XML tag
                    $nlnm++;
                    push(@nlines,$tag);
                    ### prt( "push(\@xlnmap, [ $nlnm, $lnb, $lnnm ]); # each NEW line has BEGIN and END\n" );
                    $$rlm{$nlnm} = "$lnb-$lnnm";    # each NEW line has BEGIN and END
                    $nxtxt .= "$lnb-$lnnm: [$tag]\n";
                    $tag = '';
                    $intag = 0;
                    $xml = '';
                }
            } else {
                if ($ch eq '<') {
                    $tag = $ch; # start a tag line
                    $intag = 1; # signal in a tag
                    $xml = '';
                    $dnx = 0;
                    $lnb = $lnnm;    # set the BEGIN xml line
                }
            }
            $pch = $ch;
        }
        # done a line - this is like a SPACE
        if ($intag && length($tag)) {
            $tag .= ' ' if !($tag =~ /\s$/);
        }
    }
    if (length($tag)) {
        prtw("WARNING:local_xml_to_lines: xml re-lining error! Left pending tag [$tag]\nin file [$file]...\n");
    }
    #$dnx = scalar @nlines;
    #if ($dnx < 5) {
    #    prt("ERROR: Input file [$g_sc_act_vcproj] only has $dnx lines!\n");
    #    pgm_exit(1, "INPUT TOO SMALL!!!\n");
    #}
    #if ($write_temp_xml) {
    #    my $tmpxml = "C:\\GTools\\perl\\tempvcx.xml";
    #    if (! -f $tmpxml) {
    #        write2file($nxtxt,$tmpxml);
    #        prt( "Written relined XML to '$tmpxml'\n" );
    #    }
    #}
    return \@nlines;
}

sub local_array_2_hash_on_equals($) {
	my $rlns = shift;   # (@inarr) = @_;
	my %hash = ();
	my ($itm, @arr, $key, $val, $al, $a, $cnt, $titm);
   $cnt = 0;
	foreach $itm (@{$rlns}) {
      $cnt++;
      $titm = trim_all($itm);
      if (length($titm) == 0) {
         prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt has NO length in passed array!\n" );
         next;
      } elsif ($titm eq '=') {
         # 20090912 - lets overlook this = no noise
         ### prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt is JUST an equal sign! [$itm]!\n" );
         next;
      }
		@arr = split('=',$itm);
		$al = scalar @arr;
		$key = $arr[0];
		$val = '';
		for ($a = 1; $a < $al; $a++) {
			$val .= '=' if length($val);
			$val .= $arr[$a];
		}
      if (defined $key && length($key)) {
         if (defined $hash{$key}) {
            prtw( "WARNING: array_2_hash_on_equals: Duplicate KEY: [$key] ... ADDING val [$val]\n" );
            $hash{$key} .= "\@".$val;
         } else {
            $hash{$key} = $val;
         }
      } else {
         if (defined $key) {
            prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt:$itm: key=[$key] has NO length in passed array!\n" );
         } else {
            prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt:$itm: key is NOT set in passed array!\n" );
         }
      }
	}
	return \%hash;
}

sub do_xml_file($) {
    my $xml = shift;
    my ($sb,$tm,$sz);
    if (! open INF, "<$xml") {
        prtw("WARNING: can NPT open file [$xml]\n");
    }
    my @lines = <INF>;
    close INF;
    my $cnt = scalar @lines;
    $total_lines += $cnt;
    $total_files++;
    my $withBOM = 0;
    my %lnmap = ();
    if ($cnt) {
        my ($rlines,$max);
        $withBOM = remove_utf_bom($xml,\@lines);
        $rlines = local_xml_to_lines(\%lnmap,\@lines,$xml);
        $max = scalar @{$rlines};
        my ($line,$i,$rh,@arr,$enc,$haddec,$msg,$rea,$tmp);
        $msg = "Processing $cnt lines, $max xml, from [$xml] ";
        $enc = '';
        $haddec = 0;
        for ($i = 0; $i < $max; $i++) {
            $line = ${$rlines}[$i];
            if ($line =~ /<\?xml/) {
                @arr = space_split($line);
                $rh = local_array_2_hash_on_equals(\@arr);
                if (defined ${$rh}{'encoding'}) {
                    $tmp = strip_both_quotes(${$rh}{'encoding'});
                    $tmp =~ s/>$//;
                    $tmp =~ s/\?$//;
                    $enc = strip_both_quotes($tmp);
                    if ($enc =~ /^utf/i) {
                        $enc = uc($enc);
                    } elsif ($enc =~ /^iso/i) {
                        $enc = uc($enc);
                    }
                }
                $haddec = 1;

                last;
            }
        }
        if ($haddec) {
            $msg .= "decl ";
            if (length($enc)) {
                $msg .= "encoding=\"$enc\" ";
                $files_with_enc++;
                if (defined $encs_foundh{$enc}) {
                    $encs_foundh{$enc}++;
                } else {
                    $encs_foundh{$enc} = 1;
                    push(@encs_founda,$xml);
                }
                $encs_found{$enc} = [] if (!defined $encs_found{$enc});
                $rea = $encs_found{$enc};
                push(@{$rea},$xml);

            } else {
                $msg .= "NO encoding ";
                $files_no_enc++;
            }
        } else {
            $msg .= "NO DECL ";
            $files_no_decl++;
            push(@no_xml_decl,$xml);
        }
        if ($withBOM) {
            $msg .= "BOM $last_bom_name";
            $bom_files++;
            push(@boms_founda,$xml);
            if (defined $boms_foundh{$last_bom_name}) {
                $boms_foundh{$last_bom_name}++;
            } else {
                $boms_foundh{$last_bom_name} = 1;
            }
        }
        prt("$msg\n"); # if (VERB5() || $withBOM);
    } else {
        # no line count???
        $blank_files++;
        push(@blanks_founda,$xml);
    }
    if ($sb = stat($xml)) {
        $tm = $sb->mtime;
        $sz = $sb->size;
        if ($sz > $largest_file_size) {
            $largest_file_size = $sz;
            $largest_file_name = $xml;
            $largest_file_line = $cnt;
        }
        $total_bytes += $sz;
    }
}

sub process_xml_files() {
    my $cnt = scalar @xml_array;
    prt("Have $cnt XML files to process...\n");
    my ($file);
    foreach $file (@xml_array) {
        do_xml_file($file);
    }
}

sub show_stats() {
    prt("\nProcessed $total_lines lines, $total_bytes bytes, from $total_files files.\n");
    my (@arr,$key,$rea,$cnt,$msg);
    if ($blank_files) {
        prt("Found $blank_files files BLANK!\n");
        prt(join("\n",@blanks_founda)."\n");
    }
    if ($bom_files) {
        @arr = keys(%boms_foundh);
        prt("Found $bom_files files with a BOM. ".join(" ",@arr)."\n");
        prt(join("\n",@boms_founda)."\n");
    }
    if ($files_no_decl) {
        prt("Found $files_no_decl files with NO 'xml' declaration.\n");
        prt(join("\n",@no_xml_decl)."\n");
    }
    if ($files_no_enc) {
        prt("Found $files_no_enc files with NO 'encoding' declaration.\n");
    }
    if ($files_with_enc) {
        @arr = keys(%encs_foundh);
        prt("Found $files_with_enc files with encodings like ".join(" ",@arr)."\n");
        $msg = '';
        foreach $key (keys %encs_found) {
            $rea = $encs_found{$key};
            $cnt = scalar @{$rea};
            prt(" Encoding $key - $cnt files\n");
            prt(join("\n",@{$rea})."\n");
            $msg .= "$key = $cnt ";
        }
        ###prt(join("\n",@encs_founda)."\n");
    }
    prt("$msg\n");
    prt("Counts: $files_no_decl no decl, $files_with_enc with enc, $files_no_enc no enc\n");
    prt("Counts: ".get_nn($total_lines)." lines, ".get_nn($total_bytes)." bytes (".util_bytes2ks($total_bytes)."), from ".get_nn($total_files)." files.\n");
    prt("Largest file ".get_nn($largest_file_size)." bytes, [$largest_file_name], ".get_nn($largest_file_line)." lines.\n");

}

#########################################
### MAIN ###
parse_args(@ARGV);
prt("Collecting xml files from [$in_file] directory... recursive... moment...\n");
process_in_dir($in_file);
process_xml_files();
show_stats();

pgm_exit(0,"");
########################################
sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load        (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out <file>  (-o) = Write output to this file.\n");
}

sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                $load_log = 1;
                prt("Set to load log at end.\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_xml = $sarg;
                prt("Set out file to [$out_xml].\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if (VERB1());
        }
        shift @av;
    }

    if ((length($in_file) ==  0) && $debug_on) {
        $in_file = $def_file;
        $load_log = 2;
        prt("Set DEFAULT input to [$in_file]\n");
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -d $in_file) {
        pgm_exit(1,"ERROR: Unable to find directory [$in_file]! Check name, location...\n");
    }
}

# eof - chkxml2.pl
