#!/usr/bin/perl -w
# ###########################################################
# NAME: chkxml.pl
# AIM: Given an XML file, test it - a SIMPLE test
# 27/01/2010 - add BOM support, even GUESSING when no BOM!!!
# 23/01/2010 - update many many things,
# but still does NOT handle UTF-16 encloded files - see below
# 15/11/2008 - geoff mclane - http://geoffair.net/mperl
# ###########################################################
use strict;
use warnings;
my ($perl_root,$log_lib);
if ($^O eq 'MSWin32') {
   $perl_root = 'C:\GTools\perl';
   $log_lib = 'logfile.pl';
} else {
   $perl_root = '/home/geoff/bin';
   $log_lib = 'logfile.pl';
}
unshift(@INC, $perl_root);
require $log_lib or die "Unable to load $log_lib...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
    my @tmpsp = split(/\\/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $perl_root."/temp.$pgmname.txt";
open_log($outfile);

# FOR UTF-16 (UNICODE) NOTES
# ==========================
# from : http://www.perlmonks.org/?node_id=649456
# Would need to 'use PerlIO::encoding;', then
# open( IN, '<:raw:encoding(UTF-16LE)', $in_file);
# or perhaps :raw:encoding(UTF-16LE):crlf:utf8 
# or these parameters used on 'binmode'
# see sub has_utf_16_BOM($) below...
# UNICODE file
my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\slashdot16.xml';
# This could be detected by reading the BOM
# 0000:0000 FF FE 3C 00 3F 00 78 00  6D 00 6C 00 20 00 76 00 ..<.?.x.m.l. .v.
# 0000:0010 65 00 72 00 73 00 69 00  6F 00 6E 00 3D 00 22 00 e.r.s.i.o.n.=.".
# 0000:0020 31 00 2E 00 30 00 22 00  20 00 65 00 6E 00 63 00 1...0.". .e.n.c.
# 0000:0030 6F 00 64 00 69 00 6E 00  67 00 3D 00 22 00 75 00 o.d.i.n.g.=.".u.
# 0000:0040 74 00 66 00 2D 00 31 00  36 00 22 00 3F 00 3E 00 t.f.-.1.6.".?.>.
# NOW HANDLED
# ===========
#my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\utf16bebom.xml';   # another
#my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\utf16lebom.xml';
#my $def_file = 'test4.xml';
#my $def_file = 'tests.xml';
#my $def_file = 'C:\DTEMP\libxml2-2.6.30\test\errors\attr2.xml';
#my $def_file = 'C:\FGCVS\FlightGear\data/Aircraft/B-2/B-2-set.xml';
#my $def_file = 'REC-xml-19980210.xml';
#my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\valid\REC-xml-19980210.xml';
#my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\noent\badcomment.xml';
#my $def_file = 'C:\DTEMP\FG\CubeServ420.xml';
#my $def_file = "C:\\FGCVS\\FlightGear\\data\\Aircraft\\787\\787.xml";
#my $def_file = 'C:\FG\26\data\Protocol\FgfsSharp.xml';
#my $def_file = "C:\\FGCVS\\FlightGear\\data\\Aircraft\\c172p\\c172p-set.xml";

# features
my $debug_on = 1; # to run WITHOUT parameters
my $load_log = 0;
my $verbose = 0;
my $dbg_cdata = 0;   # show begin and end of CDATA
my $dbg_lines = 0;   # show EACH line

# program variables
my @warnings = ();
my @in_files = ();   # list of INPUT files...

sub pgm_exit($$) {
   my ($ret,$msg) = @_;
   show_warnings($ret);
   if (length($msg)) {
      $msg .= "\n" if (!($msg =~ /\n$/));
      prt($msg);
   }
   close_log($outfile,$load_log);
   exit($ret);
}

sub prtw($) {
    my ($tx) = shift;
    $tx =~ s/\n$// if ($tx =~ /\n$/);
    prt("$tx\n");
    push(@warnings,$tx);
}

sub show_warnings($) {
    my ($dbg) = shift;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS ...\n" );
        foreach my $line (@warnings) {
            prt("$line\n" );
        }
        prt("\n");
    } elsif ($dbg) {
        prt("\nNo warnings issued.\n\n");
    }
}

sub prtv($) {
   my ($txt) = @_;
   prt($txt) if ($verbose);
}

sub has_utf_16_BOM($) {
   my ($fil) = shift;
   if (open INF, "<$fil") {
      binmode INF;
      my $buf = "";
      if ((read INF, $buf, 2) == 2) {
         close INF;
         my $od1 = ord(substr($buf,0,1));
         my $od2 = ord(substr($buf,1,1));
         if (($od1 == 0xFF)&&($od2 == 0xFE)) {
            return (16+2);   # LittleEndians (windows)
         } elsif (($od1 == 0xFE)&&($od2 == 0xFF)) {
            return (16+4);   # BigEndians (unix)
         } elsif ($od1 == 0) {
            return 4;
         } elsif ($od2 == 0) {
            return 2;
         }
         return 1;
      }
      close INF;
   }
   return 0;
}

#############################################################
# main process

sub get_attrs_ref($) {
   my ($txt) = @_;
   $txt = substr($txt,1) while ($txt =~ /^\s/); # clear any leading spaces
   my $len = length($txt);
   my ($i,$cc,$key,$val);
   my %h = ();
   $i = 0;
   while ($i < $len) {
      $key = '';
      for (; $i < $len; $i++) {
         $cc = substr($txt,$i,1);
         if ($cc eq '=') {
            $i++;
            $cc = substr($txt,$i,1);
            last;
         }
         $key .= $cc;
      }
      return \%h if ($cc ne '"');
      $i++;
      $val = '';
      for (; $i < $len; $i++) {
         $cc = substr($txt,$i,1);
         last if ($cc eq '"');
         $val .= $cc;
      }
      $h{$key} = $val;
      $i++; # bump over 2nd inverted commas
      # and eat any spaces
      for (; $i < $len; $i++) {
         $cc = substr($txt,$i,1);
         last if ( !($cc =~ /\s/) );
      }
   }
   return \%h;
}

sub search_harder($$$$) {
   my ($bal,$rlines,$i,$max) = @_;
   my $iret = 1;
   my $len = length($bal);
   my ($j,$cc,$line);
   for ($j = 0; $j < $len; $j++) {
      $cc = substr($bal,$j,1);
      if ($cc eq '>') {
         return 0;
      }
      if (!($cc =~ /\s/)) {
         return 1;
      }
   }
   for (; $i < $max; $i++) {
      $line = ${$rlines}[$i];
      chomp $line;
      $line = trim_all($line);
      $len = length($line);   # get new length to process
      for ($j = 0; $j < $len; $j++) {
         $cc = substr($line,$j,1);
         if ($cc eq '>') {
            return 0;
         }
         if (!($cc =~ /\s/)) {
            return 1;
         }
      }
   }
   return $iret;
}

# WOW, found this as the DOCTYPE
# <!DOCTYPE WMT_MS_Capabilities SYSTEM
# "http://schemas.cubewerx.com/schemas/wms/1.1.0/WMT_MS_Capabilities.dtd"
# [
# <!-- vendor-specific elements defined here -->
# <!ELEMENT VendorSpecificCapabilities EMPTY>
# ]>
# which is parsed correctly by IE...

sub process_xml_line_array($$$) {
   my ($fil,$bom,$rlines) = @_;
   my $iret = 0;
   my $lncnt = scalar @{$rlines};
   my ($line, $i, $lnnum);
   my ($j, $ch, $tag, $intag, $nch, $j2, $len);
   my ($attrs, $hadsp, $text, $tmp, $ttag, $ltag);
   my ($pc,$ptag,$ppc,$pppc,$incomm,$ctag,$stkcnt);
   my ($attref,$pushed,$msg,$total_chars,$doc_root,$doc_head,$doc_type);
   my ($indt,$indtsqr,$comtext,$incdata,$plnn,$pref,$lentag);
   prtv( "Processing $lncnt lines, from $fil..." );
   if ($bom & 6) {
      prtv(" in UTF-16");
      prtv("LE") if ($bom & 2);
      prtv("BE") if ($bom & 4);
      prtv("(BOM)") if ($bom & 16);
   }
   prtv("\n");
   $lnnum = 0;
   $tag = '';
   $intag = 0;
   $nch = '';
   $attrs = '';
   $hadsp = 0;
   $text = '';
   $ch = '';
   $ppc = '';
   $incomm = 0;
   $ctag = '';
   $pushed = 0;
   my @tags = ();
   $total_chars = 0;
   $doc_root = '';
   $doc_head = '';
   $doc_type = '';
   $indt = 0;
   $indtsqr = 0;
   $incdata = 0;
   $lentag = 0;
   for ($i = 0; $i < $lncnt; $i++) {
      $lnnum++;
      $line = ${$rlines}[$i];
      $len = length($line);
      $total_chars += $len;
      # now tidy the line a little
      chomp $line;
      $line = trim_all($line);
      $len = length($line);   # get new length to process
      # process the line of text
      prt("$lnnum: [$line]\n") if ($dbg_lines);
      for ($j = 0; $j < $len; $j++) {
          $j2   = $j + 1;
          $pppc = $ppc;
          $ppc  = $pc;
          $pc   = $ch; # store into previous
          $ch   = substr($line,$j,1);
          $nch  = (($j2 < $len) ? substr($line,$j2,1) : " ");
          if ( !$incdata && !$incomm && ($ch eq '-') && ($pc eq '-') && ($ppc eq '!') && ($pppc eq '<') ) {
             $incomm = 1;
             prtv(sprintf("%4d: <!-- Begin comment\n",$lnnum));
             $tag .= $ch;   # no space yet, so put it in TAG
             $j++;
             $j2   = $j + 1;
             $pppc = $ppc;
             $ppc  = $pc;
             $pc   = $ch; # store into previous
             $ch   = substr($line,$j,1);
             $nch  = (($j2 < $len) ? substr($line,$j2,1) : " ");
             next if ($j2 >= $len);
             $pc = " ";    # ensure if this NEXT is a '>', that <!--> does NOT close comments
             $comtext = $tag;
          }
          if ($intag) {
             # IN A TAG - had '<' opening
             # ==========================
              if ($hadsp) {
                  $attrs .= $ch; # after any SPACE while IN a TAG, put it in TEXT
              } else {
                  $tag .= $ch;   # else no space yet, to put it in TAG
                  $lentag = length($tag);
              }
              if ($indt) {
                 if ($incomm) {
                    $incomm = 0 if (($ch eq '>')&&($pc eq '-')&&($ppc eq '-'));
                    prtv(sprintf("%4d: Exit comment -->\n",$lnnum)) if (!$incomm);
                 } elsif ($indtsqr) {
                    $indtsqr = 0 if ($ch eq ']');
                    prtv(sprintf("%4d: End DOCTYPE CDATA] [$tag]\n",$lnnum)) if (!$indtsqr);
                 } else {
                    $indtsqr = 1 if ($ch eq '[');
                    prtv(sprintf("%4d: Bgn DOCTYPE [CDATA [$tag]\n",$lnnum)) if ($indtsqr);
                 }
                 if ( !$indtsqr && ($ch eq '>') ) {
                    $indt = 0;
                    prtv(sprintf("%4d: End DOCTYPE [$tag]\n",$lnnum));
                    # is NOT an open element, SO...
                    $attrs = '';   # clear ALL attributes
                    $text  = '';    # clear ALL text
                    $tag   = '';
                    $intag = 0;    # and OUT of DOCTYPE tag
                 }
                 next;  # stay to EAT whole DOCTYPE declaration...
              }
              # must EAT in line no space CDATA tags
              $incdata = 1 if (($lentag == 9)&&($tag =~ /^<!\[CDATA\[/) && !$hadsp);
              prt("$lnnum: BEGIN CDATA\n") if ($incdata && ($lentag == 9) && !$hadsp && $dbg_cdata);
              # just wait for TAG close
              # if ($ch eq '>') {
              #if ( !$incomm && ($ch eq '>')) {
              if ( !$incdata && !$incomm && ($ch eq '>')) {
                 $pushed = 0;
                  $msg = sprintf("%4d: '>'",$lnnum);
                  $attrs =~ s/>$//;
                  $attrs =~ s/\/$//;
                  $attrs =~ s/\s+//;
                  $attref = get_attrs_ref($attrs);
                  if ($incomm) {
                     $msg .= " COMMENT";
                  } elsif ($tag =~ /^<\?/) {
                      $msg .= " HEADER";
                      $doc_head = $tag.$attrs;
                  } elsif ($tag =~ /^<\//) {
                      # prt("/");
                      $tag =~ s/^<\///;
                      $tag =~ s/>$//;
                      if ($tag eq '!--') {
                         $msg .= " COMMENT";
                      } elsif (@tags) {
                         $ttag = trim_all($tag);
                         #$ltag = $tags[-1];
                         $pref = $tags[-1];
                         $ltag = ${$pref}[0];
                         $plnn = ${$pref}[1];
                          #if ($tags[-1] eq $tag) {
                          if ($ttag eq $ltag) {
                              #$ptag = pop @tags;
                              $pref = pop @tags;
                              $ptag = ${$pref}[0];
                              $plnn = ${$pref}[1];
                              $stkcnt = scalar @tags;
                              $msg .= " POP plnn [$plnn] ($stkcnt)";
                              $ctag = $ptag;
                          } else {
                              $msg .= " FAILED!!!!!";
                              #prtw( "WARNING: Last tag [".$tags[-1]."] NOT $tag\n" );
                              prtw( "WARNING: Last tag [$ltag]($plnn) NOT [$ttag]($lnnum)\n" );
                              $iret |= 4;
                          }
                      } else {
                         # NOT comment, NO tags
                         $msg .= " MISSED!!!!!";
                         prtw( "WARNING: NO TAGS ON STACK $tag\n" );
                         $iret |= 4;
                      }
                  } elsif ($tag =~ /^</) {
                      $tag =~ s/^<//;
                      $tag =~ s/>$//;
                      $ttag = trim_all($tag);
                      #if ($tag eq '!--') {
                      if ($tag =~ /^!--/) {
                         $msg .= " COMMENT [$ttag]";
                      } elsif ($tag =~ /^!\[CDATA\[/) {
                         $msg .= " CDATA [$ttag]";
                         $incdata = 0 if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']')); # out of <![CDATA[...with... ]]>
                         $msg .= " CDATA CLOSED" if (!$incdata);
                      } elsif ($tag =~ /^!DOCTYPE/) {
                         $msg .= " DOCTYPE [$ttag]";
                      } elsif ($pc eq '/') {
                         $msg .= " COMPLETE [$ttag]";
                      } else {
                         push(@tags, [$ttag,$lnnum]);
                         $stkcnt = scalar @tags;
                         $ctag = $ttag;
                         $msg .= " PUSHED [$ttag] ($stkcnt)";
                         $pushed = 1;  # flag an OPEN TAG
                         if ($stkcnt == 1) {
                            if (length($doc_root)) {
                               prtw("WARNING: Appears to have MULTIPLE document ROOTS [$doc_root], NOW [$ttag]\n");
                               prtw("Only one top level element is allowed in an XML document.\n");
                               $iret |= 16;
                            }
                            $doc_root = $ttag;
                         }
                      }
                  } else {
                      prt( " ???? CHECK ME " );
                      $iret |= 8;
                  }
                  $msg .= " tag [$tag]";
                  $msg .= " attrs [$attrs]" if (length($attrs));
                  $msg .= " text [$text]" if (length($text));
                  prtv("$msg\n");
                  $tag = '';
                  if (!$pushed) {
                     # is NOT an open element, SO...
                     $attrs = '';   # clear ALL attributes
                     $text = '';    # clear ALL text
                  }
                  $intag = 0;
              #} elsif ($ch =~ /\s/) {
              #} elsif ( !$incomm && ($ch =~ /\s/)) {
              } elsif ( !$incdata && !$incomm && ($ch =~ /\s/)) {
                  $hadsp = 1;
                  $indt = 1 if ($tag =~ /^<!DOCTYPE/);
                  $incdata = 1 if ($tag =~ /^<!\[CDATA\[/);
                  prtv(sprintf("%4d: Begin DOCTYPE\n",$lnnum)) if ($indt);
              } elsif ($incdata) {
                 $incdata = 0 if (($nch eq '>')&&($ch eq ']')&&($pc eq ']')); # out of <![CDATA[...with... ]]>
                 # BUT what about ]] + <newline(s)>
                 #   > is also an END of CDATA
                 if (($ch eq ']') && ($pc eq ']') && ($nch =~ /\s/) && $incdata ) {
                    # must search HARDER for close
                    my $bal = (($j2+1) < $len) ? substr($line,($j2+1)) : ""; # more in this line
                    $incdata = search_harder($bal,$rlines,$i,$lncnt); 
                    prt(sprintf("%4d: SPECIAL End CDATA\n",$lnnum)) if (!$incdata);
                 }
                 prt("$lnnum: EXIT CDATA\n") if (!$incdata && $dbg_cdata);
              }
          } else {
             # NOT yet in a TAG
             # ================
              if ($ch eq '<') {
                 # Enter a TAG with '<'
                 # out any previous TEXT
                 $attrs =~ s/>$//;
                  if (length($text)||length($attrs)) {
                      $msg = sprintf("%4d:",$lnnum);
                      $msg .= " TEXT [$text] ctag [$ctag]";
                      $msg .= " attrs [$attrs]" if (length($attrs));
                      prtv("$msg\n");
                  }
                  ###$attrs = '';   # clear any attributes
                  ###$text = '';
                  # start the TAG
                  $tag = $ch;
                  $intag = 1; # set IN A TAG
                  $hadsp = 0; # had no space in tag yet
              } else {
                  $text .= $ch;  # accumulate TEXT (between tags)
              }
          }
          # POST character processing
          if ($incomm) {
             if (($ch eq '>')&&($pc eq '-')&&($ppc eq '-')) {
                $incomm = 0;
                prtv(sprintf("%4d: Exit comment",$lnnum));
                prtv(" $tag$attrs\n");
                $tag = '';
                $attrs = '';   # clear ALL attributes
                $text = '';    # clear ALL text
                $intag = 0;
             }
          }
      } # for line of text
      $pppc = $ppc;
      $ppc  = $pc;
      $pc   = $ch; # store into previous
      $ch   = "\n";
      $hadsp = 1; # is the SAME as a SPACE
   }
   # =============================
   prt("File: [$fil]. Doc root [$doc_root]\n");
   if (@tags) {
      $len = scalar @tags;
      #$msg = join('|',@tags);
      $msg = '';
      for ($i = 0; $i < $len; $i++) {
         $pref = $tags[$i];
         $msg .= " | " if (length($msg));
         $msg .= ${$pref}[0]."(".${$pref}[1].")";
      }

      prtw( "WARNING: Still $len ON STACK! List [$msg]\n" );
      $iret |= 1;
   } elsif ($iret) {
      prt( "HAS WARNINGS... $lncnt lines, $total_chars characters.\n" );
   } else {
      prt( "Appears CLEAN... $lncnt lines, $total_chars characters.\n" );
   }
   return $iret;
}

sub process_xml_file($) {
   my ($fil) = @_;
   my $ret = 0;
   my $bom = has_utf_16_BOM($fil);
   my (@lines);
   if (open INF, "<$fil") {
      if ($bom & 2) {
         binmode INF, ":encoding(UTF-16LE)";
      } elsif ($bom & 4) {
         binmode INF, ":encoding(UTF-16BE)";
      }
      @lines = <INF>;
      close INF;
      $lines[0] = substr($lines[0],1) if ($bom & 16); # move PAST the BOM
      $ret = process_xml_line_array($fil,$bom,\@lines);
   } else {
      prtw( "WARNING: FAILED TO OPEN $fil ...\n" );
      $ret = 3;
   }
   return $ret;
}

sub process_xml_files($) {
   my ($rfils) = @_;
   my ($fil);
   my $res = 0;
   foreach $fil (@{$rfils}) {
      $res |= process_xml_file($fil);
   }
   return $res;
}

##################################
# ### MAIN ###
parse_args(@ARGV);
#prt( "$pgmname: Processing file list...\n" );
my $r = process_xml_files( \@in_files );
pgm_exit($r,"");
##################################

sub give_help {
   prt("$pgmname: Version 0.0.1 2010/01/25\n");
   prt("Usage: [Options] input_xml_file\n");
   prt("Options:\n");
   prt(" -h (-?) - This help, and exit 0\n");
   prt(" -l      - Load log at end.\n");
   prt(" -v      - Set verbose output.\n");
   prt("Given an XML input file, parse, display, and advise results.\n");
   pgm_exit(0,"Help exit");
}

sub set_verbosity {
   my (@av) = @_;
   my ($arg,$sarg);
   while (@av) {
      $arg = $av[0];
      if ($arg =~ /^-/) {
         $sarg = substr($arg,1);
         $sarg = substr($sarg,1) while ($sarg =~ /^-/);
         #if ($sarg =~ /^v/i) {
         if ($sarg =~ /^v(.*)$/i) {
            $verbose++;
            $sarg = $1;
            if (length($sarg)) {
               if ($sarg =~ /(=|:)(.+)/) {
                  $sarg = $2;
                  if ($sarg =~ /^\d+$/) {
                     $verbose = $sarg;
                  } else {
                     prt("ERROR: -v= can only be followed by a number, NOT [$sarg]!\n");
                     goto ON_ERROR;
                  }
               } else {
                  $sarg = $1;
                  prt("ERROR: -v can only be followed by '=' or ':', NOT [$sarg]!\n");
                  goto ON_ERROR;
               }
            }
            prtv("Set verbosity to $verbose.\n");
         }
      }
      shift @av;
   }
   return 0;
ON_ERROR:
    prt("ERROR: Unknown argument [$arg]! Aborting...\n");
    pgm_exit(2,"BAD ARGUMENT");
    return 1;
}

sub parse_args {
   my (@av) = @_;
   set_verbosity(@av);
   while (@av) {
      my $arg = $av[0];
      if ($arg =~ /^-/) {
         my $sarg = substr($arg,1);
         $sarg = substr($sarg,1) while ($sarg =~ /^-/);

         if (($sarg =~ /^h/i)||($sarg =~ /^\?/)) {
            prt("Got argument [$arg]...\n");
            give_help();
         } elsif ($sarg =~ /^v/i) {
            #prtv("Set verbosity to $verbose.\n");
         } elsif ($sarg =~ /^l/i) {
            $load_log = 1;
            prtv("Set to load log at end.\n");
         } else {
            prt("ERROR: Unknown argument [$arg]! Aborting...\n");
            pgm_exit(2,"BAD ARGUMENT");
         }
      } else {
         push(@in_files,$arg);
         prtv("Added input file to [$arg]\n");
      }
      shift @av;
   }
   if ($debug_on) {
      if (!@in_files) {
         push(@in_files,$def_file);
         prtv("Set input file to default [$def_file]\n");
         $load_log = 1;
         $verbose = 1;
      }
   }
}

# eof - chkxml.pl
