#!/usr/bin/perl -W
# 23/01/2013 - Strip a BOM
# 27/01/2010 - Initial cut
# BOM Type    Hex          = Decimal
# UTF-8       EF BB BF     = 239 187 191
# UTF-16 (BE) FE FF        = 254 255
# UTF-16 (LE) FF FE        = 255 254 
# UTF-32 (BE) 00 00 FE FF  = 0 0 254 255 
# UTF-32 (LE) FF FE 00 00  = 255 254 0 0 
# UTF-7       2B 2F 76, and one of: [38|39|2B|2F] 43 47 118, and one of: [56|57|43|47] +/v, and one of 8 9 + / 
# UTF-1       F7 64 4C     = 247 100 76
# UTF-EBCDIC  DD 73 66 73  = 221 115 102 115
# SCSU        0E FE FF     = 14 254 255 
# BOCU-1      FB EE 28 +optional FF 251 238 40 +optional 255
# GB-18030    84 31 95 33  = 132 49 149 51
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
###use Cwd;
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
if ($os =~ /win/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
##require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
###open_log($outfile);

my $VERS = "0.0.2 2013-01-23";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_file = '';
my $strip_bom = 1;
my $removed_bom = 0;

# ### DEBUG ###
my $debug_on = 0;
#my $def_file = 'def_file';
#my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\slashdot16.xml';
my $def_file = 'C:\HOMEPAGE\GA\projects\ini.htm';
my $do_quick_test = 0;
my $BOM_offset = -1;

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

my @BOM_list = (
    [ "UTF-8",       3, [0xEF,0xBB,0xBF     ] ], # 239 187 191   
    [ "UTF-16 (BE)", 2, [0xFE,0xFF          ] ], # 254 255 
    [ "UTF-16 (LE)", 2, [0xFF,0xFE          ] ], # 255 254
    [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255
    [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0
    [ "UTF-7a"     , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7b"     , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7c"     , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7d"     , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-1"      , 3, [0xF7,0x64,0x4C     ] ], # 247 100 76 
    [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115
    [ "SCSU"       , 3, [0x0E,0xFE,0xFF     ] ], # 14 254 255
    [ "BOCU-1"     , 3, [0xFB,0xEE,0x28     ] ], # 251 238 40
    [ "GB-18030"   , 4, [0x84,0x31,0x95,0x33] ]  # 132 49 149 51
);

sub prt($) { print shift; }


sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    ###show_warnings($val);
    ###close_log($outfile,$load_log);
    exit($val);
}


sub has_utf_16_BOM($) {
   my ($fil) = shift;
   if (open INF, "<$fil") {
      binmode INF;
      my $buf = "";
      if ((read INF, $buf, 2) == 2) {
         close INF;
         my $od1 = ord(substr($buf,0,1));
         my $od2 = ord(substr($buf,1,1));
         if (($od1 == 0xFF)&&($od2 == 0xFE)) {
            return 2;   # LittleEndians (intel)
         } elsif (($od1 == 0xFE)&&($od2 == 0xFF)) {
            return 3;   # BigEndians (amd)
         }
         return 1;
      }
      close INF;
   }
   return 0;
}

if ($do_quick_test) {
    if (@ARGV) {
       $in_file = $ARGV[0];
    }

    my $bom = has_utf_16_BOM($in_file);

    print "File [$in_file]\n";
    if ($bom == 2) {
       print " *** IS *** UTF-16 (LE) LittleEndians 0xFFFE";
    } elsif ($bom == 3) {
       print " *** IS *** UTF-16 (BE) BigEndians 0xFEFF";
    } elsif ($bom == 1) {
       print "is *** NOT *** UTF-16...";
    } else {
       print "Can not open file!";
    }
    print "\n";
}

sub line_has_bom($$) {
    my ($line,$rname) = @_;
    my $max = scalar @BOM_list;
    my $len = length($line);
    my ($i,$j,$name,$cnt,$ra,$ch,$val);
    for ($i = 0; $i < $max; $i++) {
        $name = $BOM_list[$i][0]; # name
        $cnt  = $BOM_list[$i][1]; # length
        $ra   = $BOM_list[$i][2]; # ref array of values
        if ($len > $cnt) {  # make sure line length GT BOM
            for ($j = 0; $j < $cnt; $j++) {
                $ch = substr($line,$j,1);   # extract CHAR
                $val = ord($ch);            # get VALUE
                last if ($val != ${$ra}[$j]); # compare
            }
            if ($j == $cnt) {   # if ALL values found
                $BOM_offset = $i;
                ${$rname} = $name;  # give back 'name'
                return $cnt;    # and return count
            }
        }
    }
    return 0;   # no BOM found
}

sub remove_utf_bom($$) {
    my ($ff,$ra) = @_;
    my $line = ${$ra}[0];  # get first line
    my $name = '';
    my $len = line_has_bom($line,\$name);
    my ($nm,$dr) = fileparse($ff); # just show name
    if ($len) {
        $line = substr($line,$len); # truncate line
        ${$ra}[0] = $line;  # and return minus BOM
        prt("NOTE: File [$nm] is $name encoding. BOM($len) removed.\n");
        $removed_bom = 1;
    } else {
        prt("NOTE: File [$nm] has no BOM.\n");
    }
}

sub load_file_lines($$) {
    my ($ff,$ra) = @_;
    my $lncnt = 0;
    if (open INF, "<$ff") {
        @{$ra} = <INF>;
        close INF;
        $lncnt = scalar @{$ra};
        remove_utf_bom($ff,$ra) if ($strip_bom);
    } else {
        prtw("WARNING: Unable to open [$ff]!\n");
    }
    return $lncnt;
}

sub my_write2file {
	my ($txt,$fil) = @_;
	open WOF, ">$fil" or mydie("ERROR: Unable to open $fil! $!\n");
	print WOF $txt;
	close WOF;
}


sub process_in_file($) {
    my ($inf) = @_;
    my @lines = ();
    load_file_lines($inf,\@lines);
    my $lncnt = scalar @lines;
    if ($removed_bom) {
        prt("Got $lncnt lines, from [$inf]...\n");

        if (VERB9() && ($BOM_offset >= 0)) {
            my $i = $BOM_offset;
            my $name = $BOM_list[$i][0]; # name
            my $cnt  = $BOM_list[$i][1]; # length
            my $ra   = $BOM_list[$i][2]; # ref array of values
            prt("Removed BOM: $name, $cnt [");
            for ($i = 0; $i < $cnt; $i++) {
                prt(" ") if ($i);
                prt(sprintf("%#02x", ${$ra}[$i]));
            }
            prt("]\n");
        }

        my $txt = join("",@lines)."\n";
        if (length($out_file)) {
            my_write2file($txt,$out_file);
            prt("Data written to outfile [$out_file]\n");
        } else {
            my_write2file($txt,$outfile);
            prt("Data written to DEFAULT outfile [$outfile]\n");
        }
    }
}

#########################################
### MAIN ###
parse_args(@ARGV);
process_in_file($in_file);
pgm_exit(0,"");
########################################

sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            #} elsif ($sarg =~ /^l/) {
            #    if ($sarg =~ /^ll/) {
            #        $load_log = 2;
            #    } else {
            #        $load_log = 1;
            #    }
            #    prt("Set to load log at end. ($load_log)\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_file = $sarg;
                prt("Set out file to [$out_file].\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if (VERB1());
        }
        shift @av;
    }

    if ((length($in_file) ==  0) && $debug_on) {
        $in_file = $def_file;
        prt("Set DEFAULT input to [$in_file]\n");
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    ### prt(" --load        (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out <file>  (-o) = Write output to this file.\n");
}

# eof - utf_16_BOM.pl