#!/Perl
##
## geoffmclane@hotmail.com
## just looking for a way to clean up 'complicated' HTML files, and build
## a 'simpler' version. Naturally, some of the 'look and feel' is lost,
## but the information is ALL there ... plus some ...
##
use HTML::Parser ();
use Data::Dump ();
### use URI::URL;
### use APR::URI ();
  
my $program = "stripms4"; ### published as perl1.htm 2005.05.11 update
## user feature variables ##
my $dodebug = 1;
### still a
my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Wednesday.htm";
my $defout = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/temphtml.htm";
## test show could remove the 'space' after a <br>, where it is ignored by the browser (ie)
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm";
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/Russ-04.htm";
###my $defout = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/temphtml.htm";
##my $definp = "C:/HOMEPAGE/P26/compgr.htm";
##my $defout = "C:/HOMEPAGE/P26/temphtml.htm";
my $deflog = "temphtml.txt"; # output log file ... more if $dodebug = 1!
my $defskip = "tempskip.txt"; # view what has been REJECTED, DELETED, CHOPPED 
my $WEBVERS = "P26.2005.05.05";
my $addcode = "<!-- $WEBVERS - geoffmclane.com - stripms.pl -->";

my $clearhtml = 1; # clear HTML attributes
## paragraph handling
my $clearop = 1; # clear MS o:p paragraph thingy 
my $clearpatts = 0; # clear P paragraph attributes
my $clearplang = 1; # modify P lang attribute
my $defnorm1 = 'MsoNormal';
my $defnorm2 = 'MsoPlainText';
my $postpara = 1; # handle paragraphs post </p>, to allow delete of &nbsp; only para
my @paraarr = ();
my $innsep = 0; # got C '<!', [ ], in not support empty paras = '<![if !supportEmptyParas]>'
my $nsepif = '<![if !supportEmptyParas]>';
my $delpara = 0; 

my $clearhstyl = 1; # no SYTLE statment in head - include through file, if required ... *TBD*
my $cleartdsty = 0; # clear TD attributes
my $fiximg = 1; # modify the IMG tag
my $clearhlink = 1; # clear a LINK REL statement
my $clearspan = 1; # remove all SPAN tags
my $cleardiv = 1; # remove all DIV tags
my $clearmeta = 1; # remove META (head) tag

## BODY actions
my $clearbsyle = 0; # no BODY attributes
my $fixblstyle = 1; # modify body language, if given
my $deflang = 'EN-AU'; # use English (Austrlian)
my $delayclose = 1; # close after PARSE is DONE ie no E body, or html ...
## A tag HREF hyperlink
my $keeplinks = 1; # store links found
my @links; # store of links
my $actlink = "";
my $act1 = "";

## program variables ##
my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # spacey if ($var ~= /$WHITE_PATTERN2/o ) { ...}
my ($FH, $HH, $CH); # run log, html and strip log ...
my $doout = 1; # do the OUTPUT, but can be off'ed ...
my $inpfil = "";
my $subok = 0;
my $msg = ""; # used to build a message, for multiple output
my $inhtml = 0; # in document
my $inpara = 0; # in paragraph tag
my $inhead = 0; # processing header
my $inbody = 0; # body processing
my $instyle = 0; # style processing

my $start_time = time();

open $HH, ">$defout"  or die "No HTML output file ... [$defout]!\n";
open $FH, ">$deflog"  or die "No OUT LOG file ...\n";
open $CH, ">$defskip" or die "No SKIP file ...\n";

my $p = HTML::Parser->new(api_version => 3);

$p->handler(default => \&hand, "event, line, column, text, tagname, attr");

# $p->parse_file(@ARGV ? shift : die "No input given ....\n");
parse_args(@ARGV);
# if we did NOT get an INPUT file, what to DO ...
if ( !length($inpfil) ) {
	if ($dodebug) {
		$inpfil = $definp;
	} else {
		die "No input file given ...\n";
	}
}
$msg = "$program: Started on " . localtime($start_time) ; ### . " in $cwdir ...\n" if $shwtm;
print "$msg\n";
print $FH "$msg\n";

$p->parse_file($inpfil);

my $cnt = @links;
my %links1 = ();
$msg = "Found $cnt hyperlinks ...";
prt2 ("$msg\n");
$i = 0;
foreach $actlink (@links) {
	$i++;
	$msg = "$i [$actlink]";
	prt2 ( "$msg\n" );
	local $lnk = $actlink;
	$lnk =~ s,^http://,,i; # remove HTTP:// start, if any
	$msg = "$i [$lnk]";
	prt2 ("$msg\n");
	## split URL from HREF of A tag
	local @ar = split(/\?/, $lnk);
	local $param = $ar[1]; # parameter, if any
	local @ar1 = split(/\//, $ar[0]);
	local $ar1cnt = @ar1; # get count of items
	$act1 = uc($ar1[0]); # always a first
	local $page = $ar1[1]; # if more pages ...
	chomp $page;
	if (length($page)) {
		local $i2 = 0;
		for ($i2 = 2; $i2 < $ar1cnt; $i2++) {
			chomp $ar1[$i2];
			if (length($ar1[$i2])) {
				$page .= "/$ar1[$i2]";
			}
		}
	}
	## combine the value
	$msg = "$page";
	if (length($param)) {
		$msg .= "?$param";
	}
	if ( exists $links1{$act1} ) {
		local $val2 = $links1{$act1};
		chomp $val2;
		if ( length($val2) ) {
			if ( $val2 =~ "/$msg/" ) {
				## already exists in message == a REPEAT param
				prt2 ("Avoided adding $msg\n");
			} else {
				$links1{$act1} .= " " . $msg; ### ADD "$page?$param"; ## $ar[1];
			}
		} else {
			$links1{$act1} = $msg; ### ADD "$page?$param"; ## $ar[1];
		}
	} else {
		$links1{$act1} = $msg; ### "$page?$param"; ## $ar[1];
	}
	### log message
	$msg = "$i ";
	if ($act1 eq $actlink) {
		$msg .= ' same';
	} else {
		$msg .= " site[$act1]";
	}
	prt2( "$msg\n");
}

$msg = "lISTED $cnt hyperlinks ...";
prt2 ( "$msg\n");

my @urlkeys = %links1; # get the key list
$cnt = @urlkeys / 2; # pairs key,value
$msg = "Found $cnt DOMAINS ...";
prt2 ( "$msg\n" );
print $HH "<p>$msg<br>\n";
$i = 0;
for my $url ( sort keys %links1 ) {
	$i++;
	$msg = "url[$url]";
	if ( length( $links1{$url} ) ) {
		$msg .= "/$links1{$url}";
	}
	prt2("$msg\n");
	print $HH "Link $i: <A HREF=\"http://$url\"><b>$url</b></A><br>\n";
}
print $HH "Listed $i DOMAINS</p>\n";
print $HH "</body>\n";
print $HH "</html>\n";

# print @urlkeys;
$msg = "List of $cnt DOMAINS ...";
prt2( "$msg\n" );

$msg = "$program: Ending on " . localtime(time());
prt2( "$msg\n" );

close $FH; # log file output
close $HH; # ouput HTML file
close $CH; # log of discarded items
my $bakname = getback($inpfil); # get back up name
# print "\fFrom [$inpfil] got [$bakname] ... \n";
system $defout; # run the HTML file

if ($subok) {
	open $HH, "<$defout"  or die "No new HTML input file ... [$defout]!\n";
	##my $bakname = getback($inpfil); # get back up name
	open $FH, "<$inpfil" or die "No re-open of the source ... [$inpfil]!\n";
	my @infil = <$HH>; # slurp the file, from the disk
	my @outfil = <$FH>; # slurp the original source
	close $FH; # log file output
	close $HH; # ouput HTML file
	open $HH, ">$defout"  or die "No HTML input file ... [$defout]!\n";
	open $FH, ">$inpfil" or die "No re-open of the source ... [$inpfil]!\n";
	print $HH @outfil;
	print $FH @infil;
	close $FH; # log file output
	close $HH; # ouput HTML file
}


## Event table
##  ["S",  $tag, $attr, $attrseq, $text]
##  ["E",  $tag, $text]
##  ["T",  $text, $is_data]
##  ["C",  $text]
##  ["D",  $text]
##  ["PI", $token0, $text]
sub hand {
    my($event, $line, $column, $text, $tagname, $attr) = @_;
	my $typ = uc(substr($event,0,1)); ## get TYPE
    my @d =  "$typ L$line C$column";
    #substr($text, 40) = "..." if length($text) > 40;
    push(@d, $text);
    push(@d, $tagname) if defined $tagname;
	push(@d, $attr) if $attr;
	my $otxt = Data::Dump::dump(@d);
	#print $FH Data::Dump::dump(@d), "\n";
    #print Data::Dump::dump(@d), "\n";
	print "$otxt\n";
	# now process the data ...
	my $locout = 1; # one time only output flag
	my $i;
	my $tag = '*NO_TAG*';
	if (defined $tagname) {
		$tag = uc($tagname);
	}

	# Event table
	########################################################################
	if ($typ eq 'S') { # START OF TAG, and possible ATTRIBUTES
		##  ["S",  $tag, $attr, $attrseq, $text]
		if ($tag eq 'HTML') {
			$inhtml = 1;
			if ($clearhtml) {
				$text = '<html>';
			}
			if (defined $addcode) { ## = "<!-- $WEBVERS - geoffmclane.com - stripms.pl -->";
				$text .= "\n";
				$text .= "$addcode";
			}
		} elsif ($tag eq 'A') { # a hyperlink - collect, at least ...
			if ($keeplinks) {
				my %att = %$attr; # copy the HASH, to do modifications
				$i = 0;
				prt( "Checking A attribs ...\n" );
				foreach $key (keys %att) {
					prt ( "Checking A attrib $key ...\n" );
					if ($key eq 'href') {
						prt ( "Found and saving $key=$att{$key} ...\n" );
						push(@links, $att{$key});
					}
				}
			}
		} elsif ($tag eq 'P') {
			$inpara = 1;
			@paraarr = (); # clear paragraph accumulator
			if ($clearpatts) {
				print $FH "Paragraph from $text to <$tagname> ...\n";
				$text = "<$tagname>";
			} elsif ($clearplang) { # modify P lang= attribute
				# use my $deflang = 'EN-AU'; # use English (Austrlian)
				my %att = %$attr; # copy the HASH, to do modifications
				$i = 0;
				prt( "Checking P attrib ...\n" );
				foreach $key (keys %att) {
					prt ( "Checking attrib $key ...\n" );
					if ($key eq 'lang') {
						prt ( "Found $key=$att{$key} ...\n" );
						if ($att{$key} ne $deflang) {
							prt ( "Modifying $key=$att{$key} to [$deflang] ...\n" );
							$att{$key} = $deflang;
						}
						$i++;
					} elsif ($key eq 'class') {
						##my $defnorm1 = 'MsoNormal';
						##my $defnorm2 = 'MsoPlainText';
						prt ( "Found $key=[" . $att{$key} . "]\n" );
						if (($att{$key} eq $defnorm1)||($att{$key} eq $defnorm2) ) {
							prtd ( "Deleting $key=$att{$key} ...\n" );
							#$att{$key} = $deflang;
							delete $att{$key}; # remove this MS reference
							$i++;
						}
					}
				}
				if ($i) { # ok, change output, re-run to build new HTML
					$text = "<$tagname"; # start tag again
					while (($key,$value) = each %att) {
						$text .= " $key=$value";
					}
					$text .= '>'; # close tag
				}
			}
		} elsif ($tag eq 'HEAD') {
			$inhead = 1;
		} elsif ($tag eq 'BODY') {
			$inbody = 1;
			if ($clearbsyle) { # no BODY attributes
				$text = '<body>';
			} elsif ($fixblstyle) { # modify body language, if given
				# use my $deflang = 'EN-AU'; # use English (Austrlian)
				my %att = %$attr; # copy the HASH, to do modifications
				$i = 0;
				prt( "Checking BODY attrib ...\n" );
				foreach $key (keys %att) {
					prt ( "Checking attrib $key ...\n" );
					if ($key eq 'lang') {
						prt ( "Found lang=$key ...\n" );
						if ($att{$key} ne $deflang) {
							prt ( "Modifying 'lang=$att{$key} to [$deflang] ...\n" );
							$att{$key} = $deflang;
						}
						$i++;
					}
				}
				if ($i) { # ok, change output, re-run to build new HTML
					$text = "<$tag"; # start tag again
					while (($key,$value) = each %att) {
						$text .= " $key=$value";
					}
					$text .= '>'; # close tag
				}
			}
		} elsif ($tag eq 'STYLE') {
			$instyle =1;
			if ($clearhstyl) {
				# in head - close out S style to E sytle
				if ($inhead) {
					$doout = 0; # CLOSE output
				}
			}
		} elsif ($tag eq 'SPAN') {
			if ($clearspan) { # remove all SPAN tags
				$locout = 0;
			}
		} elsif ($tag eq 'O:P') {
			if ($clearop) { # clear MS o:p paragraph thingy 
				$locout = 0;
			}
		} elsif ($tag eq 'LINK') {
			if ($clearhlink) {
				$locout = 0;
			}
		} elsif ($tag eq 'DIV') {
			if ($cleardiv) {
				$locout = 0;
			}
		} elsif ($tag eq 'TD') {
			if ($cleartdsty) {
				$text = '<td>';
			}
		} elsif ($tag eq 'IMG') {
			if ($fiximg) { # modify the IMG tag
				my %att = %$attr; # copy the HASH, to do modifications
				$i = 0;
				foreach $key (keys %att) {
					if ($key eq 'v:shapes') {
						prtd ( "Deleting attrib $key ...\n" );
						delete $att{$key}; # remove this MS reference
						$i++;
					}
				}
				if ($i) {
					$text = '<IMG'; # start IMG tag again
					while (($key,$value) = each %att) {
						$text .= " $key=$value";
					}
					$text .= '>'; # close IMG tag
				}
			}
		} elsif ($tag eq 'META') {
			if ($clearmeta) { # remove META (head) tag
				$locout = 0;
			}
		}
	########################################################################
	} elsif ($typ eq 'E') {
		##  ["E",  $tag, $text]
		if ($tag eq 'HTML') {
			if ($delayclose) { # close after PARSE is DONE ie no E body, or html ...
				$locout = 0;
			} else {
				$inhtml = 0;
			}
		} elsif ($tag eq 'P') {
			$inpara = 0;
			if ($postpara) {
				$locout = 0; # clear any output reqd
				push(@paraarr,$text); # do it all here
				$i = @paraarr; # count item count
				if ($delpara) { # if a DUMMY thing
					prtd ("Para components $i [ @paraarr ]\n");
					prtd ("This NON-BLAMK-SPACE dummy paragraph has been deleted ...\n");
				} else { # send em out, line by line ...
					$i = 0;
					foreach $msg (@paraarr) {
						$i++;
						prt ( "$msg\n" );
						print $HH "$msg\n"; # out to HTML file
					}
				}
			}
			$delpara = 0; ####### E = clear DELETE OF PARAGRAPH #######
		} elsif ($tag eq 'HEAD') {
			$inhead = 0;
		} elsif ($tag eq 'BODY') {
			if ($delayclose) { # close after PARSE is DONE ie no E body, or html ...
				$locout = 0;
			} else {
				$inbody = 0;
			}
		} elsif ($tag eq 'STYLE') {
			$instyle = 0;
			if ($clearhstyl) {
				# in head - close out S style to E sytle
				if ($inhead) {
					$doout = 1; # OPEN output
					$locout = 0; # but NOT for this style one
				}
			}
		} elsif ($tag eq 'SPAN') {
			if ($clearspan) { # remove all SPAN tags
				$locout = 0;
			}
		} elsif ($tag eq 'O:P') {
			if ($clearop) { # clear MS o:p paragraph thingy 
				$locout = 0;
			}
		} elsif ($tag eq 'LINK') {
			if ($clearhlink) {
				$locout = 0;
			}
		} elsif ($tag eq 'DIV') {
			if ($cleardiv) {
				$locout = 0;
			}
		}
	} elsif ($typ eq 'T') {
		##  ["T",  $text, $is_data]
		if ($text eq '&nbsp;') {
			if ($innsep) {
				$delpara = 1;
				prt ("Should DELETE this 'dummy' paragraph...\n");
			}
		}
	} elsif ($typ eq 'C') {
		##  ["C",  $text]
		$locout = 0; # toss all CODE
		if ($text eq $nsepif ) {  #'<![if !supportEmptyParas]>'
			$innsep = 1; # got C '<!', [ ], in not support empty paras
		} else {
			$innsep = 0;
		}
	} elsif ($typ eq 'D') {
		##  ["D",  $text]
	} elsif ($typ eq 'P') {
		##  ["PI", $token0, $text]
	}
	### end event table ###########################################################
	if ($text =~ /$WHITE_PATTERN2/o) {
		print $CH "ws[$otxt]\n";
		print $CH "ws[$text]\n";
	} else {
		if ($doout && $locout) {
			if ($postpara && $inpara) {
				push(@paraarr, $text); # store paragraph components
			} else {
				print $FH "$otxt\n";
				print $HH "$text\n";
			}
		} else {
			print $CH "$otxt\n";
			print $CH "$text\n";
		}
	}
}

sub parse_args {
	my (@av) = @_; # get stack
	while (@av) {
		my $arg = uc($av[0]);
		if ($arg =~ /^-/) {
			if ($arg eq '-V') {
				print "Version: 0.0.1 - May 2005\n";
			} elsif (($arg eq '-H') || ($arg eq '-?')) {
				die "stripms infile [options]\n";
			} else {
				die "ERROR: Unknown option [$arg]\n";
			}
		} else {
			if (length($inpfil)) {
				die "ERROR: Can not handle two input files ...\n";
			}
			$inpfil = $arg;
			if ( !(-f $inpfil) ) {
				die "ERROR: Can NOT locate file [$inpfil] ...\n";
			}

		}
		shift @av;
	}
}

sub prt {
	if ($dodebug) {
		print $FH @_;
	}
}

sub prt2 {
	if ($dodebug) {
		prt(@_);
		print @_;
	}
}

sub prtd {
	if ($dodebug) {
		prt(@_);
		print $CH @_;
	}
}


sub getback {
	local ($ff) = @_;
	$ff =~ s/\\/\//g; # sub/ensure *nix path separators ie c:\usr becomes c:/usr ...
	local $nf = getfn($ff);
	$nf .= '.bak';
	return $nf;
}

sub dirname { # passed a path, './dir1/dir2/file.name' returns './dir1/dir2/
local ($file) = @_;
local ($sub);
($sub = $file) =~ s,/+[^/]+$,,g;
$sub = '.' if $sub eq $file;
return $sub;
}

sub getfn {
local ($ff) = @_;
local $dn = dirname($ff); # get the directory/path name ...
local $nf;
($nf = $ff) =~ s,^$dn,,;
$nf =~ s,^/,,;
return $nf;
}


# EOF
