#!/usr/bin/perl

use strict;
use FILE::stat;
use CGI;
use CGI qw(:standard);
use HTML::Parser ();

my $verbose = 0;
my $package = 'stripms.pl';
my $logfile = 'striplog.txt';
my $actfile = '';
my @input_files = ();
my $linebyline = 0;
my $usearray = 1;
my $msg;
my $htmdata = '';
my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm";
my $dotoken = 0;

parse_arguments(@ARGV);

die "program: no input files found or specified\n" if ! @input_files;

$msg = "Begin processing ...\n";
open (OP, "> $logfile") or die "No LOG file ...\n";
print OP $msg;
print $msg;
foreach $actfile (@input_files) {
 #my $sb = stat($actfile);
 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
       $atime,$mtime,$ctime,$blksize,$blocks)
           = stat($actfile);
 print OP "Processing [$actfile] ... $size bytes ...\n";
 printf "File is %s, is %s bytes, perm %04o, mtime %s\n",
	 $actfile,
	 $size, $mode & 07777, scalar localtime $mtime;

 open (INP, $actfile) or die "ERROR: Unable to OPEN file $actfile!\n";
 if ($linebyline) {
  while ( <INP> ) {
 	 chomp;
 	 my $line = $_;
	 print "$line\n";
  } ## WHILE THERE IS input
 } elsif ($usearray) {
  my @lines = <INP>;
  chomp @lines;
  print OP "\n*********************************************\n";
  print "@lines";
  print OP "@lines";
  my $all1 = join (' ',@lines);
  print OP "\n*********************************************\n";
  print $all1;
  $htmdata = $all1;
  print OP $all1;
  print OP "\n*********************************************\n";
  print "\n***** search for emails ****\n";
  my $str = $all1;
  while ($str =~ /(([\w._-]+)\@([\w._-]+))/) { ## look for an email addr
     $msg = "user:$2 host:$3 all:$1\n"; ## parts of the addr
	 print OP $msg;
	 print $msg;
	 $str = $'; ## set the str to be the "rest" of the string
  }
  print "\n***** done search for emails ****\n";
 } else {
  $/ = undef; 
  my $all = <INP>;    ## read the whole file into one string 
  print $all;
 }

 close (INP); ## close this INPUT file

} ## end foreach input ...

print "\nDone foreach ...\n";
##$actfile = "c:/HOMEPAGE/P26/home2.htm";

sub dotoken {

print "\nDoning token parser ...\n";
use HTML::TokeParser;
my $p2 = HTML::TokeParser->new($actfile); ## shift||"index.html");
while (my $token = $p2->get_tag("a")) {
    my $url = $token->[1]{href} || "-";
    my $text = $p2->get_trimmed_text("/a");
    print "$url\t$text\n";
}
print "\nDone URL list ...$actfile\n";

my $p3 = HTML::TokeParser->new($actfile) ||
      die "Can't open: $!";

##  ["S",  $tag, $attr, $attrseq, $text]
##  ["E",  $tag, $text]
##  ["T",  $text, $is_data]
##  ["C",  $text]
##  ["D",  $text]
##  ["PI", $token0, $text]
##use Switch;
my $WHITE_PATTERN = "^[ \t]*\$"; # only spacey stuff, like if ( /$WHITE_PATTERN/o ) { ...}
my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # only spacey stuff, like if ( /$WHITE_PATTERN/o ) { ...}

 while (my $tok3 = $p3->get_token) {
     # ...
	 my $typ = $tok3->[0];
	 my $txt = $tok3->[1];
	 print OP "Before=[$txt]\n";
	chomp $txt;
	 print OP "AfterC=[$txt]\n";
	if ($txt =~ /$WHITE_PATTERN2/o) {
		# skip just space
	} else {
##	 switch ($typ) {
##		 case "S" {    }
##       case "E" {    }
##     }
	if ($typ eq "S") {
		$msg = "S=<" . $txt . ">" . $tok3->[3];
	} elsif ($typ eq "E") {
		$msg = "E=</" . $txt . ">" . $tok3->[2];
	} elsif ($typ eq "T") {
		$msg = "T=$txt";
	} elsif ($typ eq "C") {
		$msg = "C=$txt";
	} elsif ($typ eq "D") {
		$msg = "D=$txt";
	} elsif ($typ eq "PI") {
		$msg = "PI=$txt";
	} else {
		$msg = "??" . $typ . $txt;
	}
	$msg .= "\n";
	print $msg;
	print OP $msg;
	}
 }

} ## if $dotoken

###use HTML::Parser ();
##use HTML::Parser;
#HTML::Parser->new(default_h => [\&my_text, "text"], ### [sub { print shift }, 'text'],
#                  comment_h => [\&my_comm, "text"],
#                  )->parse_file($actfile || die) || die $!;
# Create parser object
my $pp = HTML::Parser->new( api_version => 3,
						### start_h => [\&my_start, "tagname, attr"],
						start_h => [\&my_start2, "self,tokens" ],
                        end_h   => [\&my_end,   "tagname"],
						text_h  => [\&my_text,  "text"],
                        marked_sections => 1,
                       );
  sub start_handler
  {
    return if shift ne "title";
    my $self = shift;
    $self->handler(text => sub { print shift }, "dtext");
    $self->handler(end  => sub { shift->eof if shift eq "title"; },
                           "tagname,self");
  }

##  my $p = HTML::Parser->new(api_version => 3);
##  $p->handler( start => \&start_handler, "tagname,self");
##  $p->parse_file($actfile || die) || die $!;

 # Parse document text chunk by chunk
 # $pp->parse($chunk1);
# $pp->parse($htmdata);
 #...
# $pp->eof;                 # signal end of document

# Parse directly from file
print "\n********* parse directly from file ************************************\n";
$pp->parse_file("$actfile"); ### foo.html");
print "\n********* end parse directly from file ************************************\n";
 # or
# open(F, "$actfile") || die "Unable to open file ...\n";
# $pP->parse_file(*F);

print "THE END!\n";
close OP;

sub my_start2 {
	my ($self, @toks) = @_;
	$msg = '';
	foreach my $tkn (@toks) {
		$msg .= " $tkn"
	}
	print "@toks\n";
	print "$msg \n";
}

sub my_start {
	my ($tg,%atr) = @_;
	my @attr = %atr;
	#$msg = "Start=[$tg] Attr=[";
	#$msg .= join (" ", @attr);
	#$msg .= "]\n";
	#print $msg;
	print "size of hash:  " . keys( %atr ) . ".\n";
	$msg = "Start=[$tg] Attr=[";
    while ( my ($key, $value) = each(%atr) ) {
        $msg .= " $key => $value";
    }
	$msg .= "]\n";
	print $msg;

	$msg = "Start=[$tg] Attr=[";
	for my $key2 ( keys %atr ) {
        my $value2 = $atr{$key2};
        $msg .= " $key2 => $value2";
    }
	$msg .= "]\n";
	print $msg;
}

sub my_end {
	my ($tge) = @_;
	$msg = "End=[$tge]\n";
	#print $msg;
}

sub my_text {
	my ($txt) = @_;
	## chomp $txt;
    ## print "text=[$txt]\n";
	## print $txt;
}

sub my_comm {
	my ($txt) = @_;
	## chomp $txt;
    ## print "text=[$txt]\n";
	print $txt;
}

sub parse_arguments {
 my @av = @_; # take it off the passed stack
 while (@av) {
  if ($av[0] eq '-v') {
   print "$package: Version 0.0.1 ... setting verbose ...\n";
   $verbose = 1;
  } elsif (($av[0] eq '-help') || ($av[0] eq '-h') || ($av[0] eq '-?')) {
   die "$package filename [-v = version] [-? = this help] - strip MS html from file ...\n";
  } elsif ($av[0] =~ /^-/) {
   die "$package: unrecognised option --$av[0]\nTry $package -help, for more information ;=)).\n";
  } else {
   $actfile = $av[0];
   print "Adding file [$actfile] ...\n";
   if ( !( -f $actfile) ) {
	 die "$package: Unable to loacte [$actfile]!\n";
   }
   push(@input_files, $actfile);
  }

  shift @av; # move to next argument
 } # while arguments

 ### set default
 push(@input_files, $definp) if ! @input_files;

}

