#!/Perl
# cleanhtm02.pl - 2006.10.01 - geoff mclane (geoffmclane.com)
# AIM: To clean certain items from a HTML document ...
# specifically target microsoft word 'filtered' output which uses a
# paragraph style which places every line in a <p> ... </p>,
# and to denote REAL paragraphs, inserts a <p>&nbsp;</p> line ...
# Thus to remove some of the <p>, using a <br> to get to a new line ...
# =====================================================================
use strict;
require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n";
require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp'.$0.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
# user variable
###my $def_input = '..\javascript\messageontop.htm';
my $def_input = 'temptidy48in.htm';
###my $def_input = 'C:\Documents and Settings\Geoff McLane.PRO-1\My Documents\tidy\tidy-48.htm';
my $def_output = 'temptidy48.htm';
###my $def_input = 'C:\Documents and Settings\Geoff McLane.PRO-1\My Documents\FG\devel\fgd-008.htm';
###my $def_output = 'tempfgd02.htm';

# debug
my $dbg1 = 0;
my $dbg2 = 0;
my $dbg3 = 0;
my $dbg4 = 0;
my $dbg5 = 0;
my $dbg6 = 0;
my $dbg7 = 0;
my $dbg8 = 0;
# program variable
my $in_file = $def_input;
my $out_file = $def_output;
$in_file = pop @ARGV if (@ARGV);
$out_file = pop @ARGV if (@ARGV);
prt( "Got input from [$in_file], output to [$out_file] ...\n" );
if ( ! -f $in_file) {
	mydie("OOPS: Can NOT locate [$in_file] ...\n");
}
open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n");
my @lines = <IF>;	# slurp it all in
close IF;
my $cnt = scalar @lines;
prt("Processing $cnt lines from [$in_file] ...\n");
my $txt = join("\n", @lines);
my $ccnt = length($txt);
prt("Or $ccnt characters from [$in_file] ...\n");
my $newtxt = general_adjustments( $txt );
$ccnt = length($newtxt);
my @lines = split("\n", $newtxt);
my $line = '';
my $word = 0;
my $cont = '';
my $lncnt = scalar @lines;
my $style = '';
my @styles = ();
my %mystyles = ();
my %myattrs = ();
my @attribs = ();
my $stbgn = 0;
my $stend = 0;
my $bdybgn = 0;
my $bdyend = 0;
my @paralist = ();
my $blkparas = 0;
my $stybgn = 0;
my $styend = 0;
my %xmlblocks = ();
################################################################
my $newstyle = <<EOF;

<style type="text/css">
body {
 background-image:url('clds5.jpg');
 margin: 0cm 1cm 0cm 1cm;
}

.code {
 margin: 0px 10px 0px 10px;
 background: #f0f8ff;
 border-width: 1px;
 border-style: solid solid solid solid;
 border-color: #000090;
 width: 90%;
 padding: 0px 10px 0px 10px;
}

.diff {
 margin: 0px 10px 0px 10px;
 background: #f0ffef;
 border-width: 1px;
 border-style: solid solid solid solid;
 border-color: #900090;
 width: 90%;
 padding: 0px 10px 0px 10px;
}

</style>
EOF

my $my_pre = '<pre class="code">';

################################################################
$word = check_for_word();
get_the_style();
@attribs = keys %myattrs;
my $atcnt = scalar @attribs;
prt( "Got $atcnt attibutes to look for ... style bgn $stbgn($stybgn), end $stend($styend) body bgn $bdybgn, end $bdyend ...\n" );
##sub removetagattrib { my ($txt, $tag) = @_;
my $clntxt = removetagattrib( $newtxt, 'p' );
$clntxt = striptag( $clntxt, 'style' );
$clntxt = add_my_style( $clntxt );
$clntxt = inline_paras( $clntxt );
get_para_list( $clntxt );
prt( "In " . scalar @paralist . " paragraphs, found $blkparas blanks ...\n" );
show_xml_list( $clntxt );
$clntxt = prt_para_list( $clntxt );
write_out_file($clntxt, $out_file);
system($out_file);
close_log($outfile,1);
exit(0);

# ###############################################
# all subs below
# ##############

sub prt_para_list {
	my ($tx) = shift;
	my $cnt = scalar @paralist;
	my $mt = length($tx);
	my ($ch, $i, $ip, $stp, $edp, $ntx, $lstp, $ledp, $pstp, $pedp, $ps, $pe, $inp);
	my ($lpstp, $lpedp, $blk, $inx, $gotpre, $msg);
	prt( "Showing $cnt paragraphs ...\n" );
	$ip = 0;
	$ntx = '';
	$lstp = 0;
	$ledp = 0;
	$inx = 0;
	$stp = 0;
	$gotpre = 0;
	$inx = in_xml_range($stp); 
	if ($cnt) {
		$stp = $paralist[0][0];
		$edp = $paralist[0][1];
		$pstp = $paralist[0][2];
		$pedp = $paralist[0][3];
		$blk = $paralist[0][4];
		$ch = substr($tx, 0, $stp);
		$ntx = $ch;
		$lstp = $stp;
		$ledp = $edp;
		$lpstp = $pstp;
		$lpedp = $pedp;
	} # this includes a <p> - the first p ...
	$inp = 1;
	for ($i = 0; $i < $cnt; $i++) {
		$ip++;
		$stp = $paralist[$i][0];
		$edp = $paralist[$i][1];
		$pstp = $paralist[$i][2];
		$pedp = $paralist[$i][3];
		$blk = $paralist[$i][4];
		$inx = in_xml_range($stp); 
		if ($lpedp < $pstp) {
			$msg = '';
			if ($inp) {
				# exclude <p>
				$msg .= "In p exclude";
				$ch = substr($tx, $lpedp, $pstp - $lpedp);
				if ($inx) {
					####$ch .= '</p>'; # close paragraph
					if (!$gotpre) {
						$ch .= '</p>' . $my_pre;
						$gotpre = 1;
						$msg .= " added my pre";
					}
					$inp = 0;
				}
			} else {
				# include <p>
				if ($inx) {
					# exclude <p>
					$msg .= "Not p excl";
					$ch = substr($tx, $lpedp, $pstp - $lpedp);
					if (!$gotpre) {
						$ch .= $my_pre; # '<pre>';
						$gotpre = 1;
						$msg .= " add pre";
					}
				} else {
					# include <p>
					$msg .= "Not p incl";
					$ch = substr($tx, $lpedp, $stp - $lpedp);
				}
			}
			##prt( "$ip Inbetween [$ch] \n" ) if ($dbg5);
			prt( "$ip [$pstp-$stp-$edp-$pedp] Inbetween inp=$inp inx=$inx [$ch] [$msg]\n" ) if ($dbg8);
			$ntx .= $ch;
		}
		$ch = substr($tx, $stp, $edp - $stp);
		$ps = substr($tx, $pstp, $stp - $pstp);
		$pe = substr($tx, $edp, $pedp - $edp);
		###if ($ch eq '&nbsp;') {
		if ($blk) {
			$ch = substr($tx, $pstp, $pedp - $pstp);
			prt( "$ip BLANK P[$ch]\n" ) if ($dbg5);
			if ($gotpre) {
				###if ($inx) {
					$ntx .= '</pre>';
					$gotpre = 0;
				###}
			} else {
				$ntx .= '</p>';
			}
			$inp = 0;
		} else {
			###prt( "$ip P[$ch]\n" );
			prt( "$ip P[$ps][$pe]\n" ) if ($dbg5);
			$ntx .= $ch;
			if (($i + 1) < $cnt) {
				$blk = $paralist[$i+1][4];
				if ($blk == 0) {
					if (!$gotpre) {
						$ntx .= '<br>';
					}
				}
			}
			$inp = 1;
		}
		$lstp = $stp;
		$ledp = $edp;
		$lpstp = $pstp;
		$lpedp = $pedp;
	}
	if ($inp) {
		$ntx .= '</p>';
	}
	if ($pedp < $mt) {
		$ntx .= substr($tx, $pedp); # add any remainder
	}
	return $ntx;
}

# check for things like
# <p>&lt;makeconf.mak&gt;, and if found seek to end
# <p>&lt;/makeconf.mak&gt;
sub check_my_xml {
	my ($tx, $i2) = @_;
	my $mt = length($tx);
	my $ch = '';
	my $i = 0;
	my $ln = '';
	for ($i = 0; $i < $mt; $i++) {
		$ch = substr($tx,$i,1);
		if ($ch eq '&') {
			$ln = $ch;
			$i++; # bump to next, and get the line
			for ( ; $i < $mt; $i++) {
				$ch = substr($tx,$i,1);
				if (($ch eq "\r")||($ch eq "\n")) {
					last;
				}
				$ln .= $ch;
			}
			if ($ln =~ /&lt;(\S+)&gt;/) {
				my $xml = $1;
				my $neg = 0;
				if (substr($xml,0,1) eq '/') {
					$xml = substr($xml,1);
					$neg = 1;
				} else {
					$i2 += length($ln);
				}
				if (defined $xmlblocks{$xml}) {
					prt("Old <$xml> ... ["  . $xmlblocks{$xml} . "]+[$neg $i2]\n") if ($dbg6);
					$xmlblocks{$xml} .= " $neg $i2";
				} else {
					prt("New <$xml> ... [$neg $i2]\n") if ($dbg6);
					$xmlblocks{$xml} = "$neg $i2";
				}
			}
			last;
		}
	}
}

sub in_xml_range {
	my ($j) = shift;
	foreach my $k (keys %xmlblocks) {
		my $v = $xmlblocks{$k}; # extract value
		my @a = split(' ',$v);
		if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) {
			if (($j >= $a[0]) && ($j <= $a[1])){
				return 1;
			}
		}
	}
	return 0;
}

sub vv_in_xml_range {
	my ($j) = shift;
	prt( "Checking [$j] ...\n" );
	foreach my $k (keys %xmlblocks) {
		my $v = $xmlblocks{$k}; # extract value
		my @a = split(' ',$v);
		if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) {
			if (($j >= $a[0]) && ($j >= $a[1])){
				return 1;
			}
		}
	}
	return 0;
}


sub show_xml_list {
	my ($tx) = shift;
	foreach my $k (keys %xmlblocks) {
		my $v = $xmlblocks{$k}; # extract value
		my @arr = split(" ",$v);
		if ((scalar @arr == 4) && ($arr[0] == 0) && ($arr[2] == 1)) {
			# appear to have 0 item start 1 item end
			$xmlblocks{$k} = "$arr[1] $arr[3]";
			prt( "Got $k bgn $arr[1], end $arr[3] ...\n" ) if ($dbg7);
		} else {
			prt( "Discarding [$k][$v] ...\n" ) if ($dbg7);
			delete $xmlblocks{$k}; # remove it
		}
	}
	prt( "Final list ...\n" );
	foreach my $k (keys %xmlblocks) {
		my $v = $xmlblocks{$k}; # extract value
		prt( "$k [$v]\n" );
		my @a = split(' ',$v);
		if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) {
			my $blk = substr($tx, $a[0], $a[1] - $a[0]);
			prt( "BLOCK=[$blk]\n" ) if ($dbg7);
		} else {
			prt( "CHECH MISSED!!!\n" );
		}
	}
}

sub get_para_list {
	my ($tx) = shift;
	my $mt = length($tx);
	my ($ch, $i, $tg, $ip, $stp, $edp, $msg, $pstp, $pedp, $blk);
	$tg = '';
	$ip = 0;
	$stp = 0;
	$edp = 0;
	$pstp = 0;
	$pedp = 0;
	for ($i = 0; $i < $mt; $i++) {
		$ch = substr($tx,$i,1);
		if ($ch eq '<') {
			$edp = $i;
			$i++;
			$tg = $ch;
			for ( ; $i < $mt; $i++) {
				$ch = substr($tx,$i,1);
				$tg .= $ch;
				if ($ch eq '>') {
					$msg = "Got tag [$tg] ...";
					if ($tg =~ /<p.*>/i) {
						$msg .= " in para";
						$stp = $i + 1;
						$pstp = $edp; # start of '<p...
						$ip = 1;
					} elsif ($tg =~ /<\/p>/i) {
						$msg .= " out para";
						$ip = 0;
						###$edp = $i;
						$pedp = $i + 1;
						if ($stp && ($edp > $stp)) {
							$ch = substr($tx, $stp, $edp - $stp);
							# keep the intern stt   end outer sts end
							$blk = 0;
							if ($ch eq '&nbsp;') {
								$blkparas++;
								$blk = 1;
							}
							push(@paralist, [ $stp, $edp, $pstp, $pedp, $blk ]);
						}
						$stp = 0;
					}
					prt("$msg\n") if ($dbg3);
					last;
				}
			}
		} elsif ($ch eq '&') {
			check_my_xml( substr($tx,$i), $i );
		}
	}
}

sub process_style {
	my ($tx) = shift;
	my $sl = length($tx);
	my ($i, $ch, $sp, $ob, $nm, $stl);
	$sp = 0;
	$ob = 0;
	$nm = ''; # name of the style, can be comma separated
	$stl = ''; # material between braces
	for ($i = 0; $i < $sl; $i++) {
		$ch = substr($tx,$i,1);
		if ($ch eq '<') {
			# eat to end of this
			$i++;
			if ($i < $sl) {
				$ch = substr($tx,$i,1);
				if ($ch eq '!') {
					$i += 2;
				} else {
					for ( ; $i < $sl; $i++) {
						$ch = substr($tx,$i,1);
						if ($ch eq '>') {
							last;
						}
					}
				}
			}
		} elsif ($ch eq '/') {
			if ((($i + 1) < $sl) && (substr($tx,$i+1,1) eq '*')) {
				# entered a comment - eat it
				$i++;
				for ( ; $i < $sl; $i++) {
					$ch = substr($tx,$i,1);
					if ( ($ch eq '*') && (substr($tx,$i+1,1) eq '/')) {
						last;
					}
				}
			}
		} elsif ($ch =~ /\s/) {
			$sp++;
		} elsif ($ch eq '{') {
			$ob = 1;
		} elsif ($ch eq '}') {
			$ob = 0;
			if (length($nm) && length($stl)) {
				my @arr = split(",", $nm);
				foreach my $bit (@arr) {
					$bit = trimall($bit);
					if (length($bit)) {
						if (defined $mystyles{$bit}) {
							prt( "Duplicate of [$bit], with val = [" . $mystyles{$bit} . "\n" );
							$mystyles{$bit} .= ' ' . $stl;
						} else {
							$mystyles{$bit} = $stl;
						}
					}
				}
			}
			prt( "Name = [$nm] Styles = [$stl] ...\n" ) if ($dbg4);
			$nm = '';
			$stl = '';
		} else {
			# seeking something { more ... }
			if ($ob) {
				$stl .= $ch;
			} else {
				$nm .= $ch;
			}
		}
	}
}

sub get_the_style {
	my $i = 0;
	my $off = 0;
	my $len = 0;
	# run one - extract the STYLE stuff
	for ($i = 0; $i < $lncnt; $i++) {
		$line = $lines[$i];
		$len = length($line);
		chomp $line;
		$line =~ s/\r$//;
		prt( "Line: ". ($i + 1) . " $line\n" ) if ($dbg2); 
		if ($line =~ /<style(.*)>/i) {
			prt( "Found [$line] ...\n" );
			$style = $line;
			$stbgn = $i; # keep BEGIN of STYLE
			$stybgn = $off;
			###push(@styles, $line);
			while ( !($style =~ /<\/style>/i) && ($i < $lncnt) ) {
				$i++;
				$line = $lines[$i];
				$len = length($line);
				chomp $line;
				$line =~ s/\r$//;
				$style .= ' ' . $line;
				push(@styles, $line);
				$off += $len;
			}
			$stend = $i; # and END of STYLE
			$styend = $off;
			$len = 0;
		} elsif ($line =~ /<body(.*)>/i) {
			$bdybgn = $i;
		} elsif ($line =~ /<\/body>/i) {
			$bdyend = $i;
		}
		$off += $len;
	}
	process_style( $style );
	prt( "Style = [$style] ...\n" ) if ($dbg4);
	foreach my $k (keys %mystyles) {
		my $v = $mystyles{$k};
		prt( "$k { $v }\n" ) if ($dbg4);
		my @ar = split(/\./,$k);
		my $tg = trimall($ar[0]);
		my $at = $tg;
		if (scalar @ar == 2) {
			$at = trimall($ar[1]);
		}

		if (defined $myattrs{$at} ) {
			prt( "Adding [$tg] to [$at] ...\n" ) if ($dbg4);
			$myattrs{$at} .= ' ' . $tg;
		} else {
			prt( "Setting [$tg] to [$at] ...\n" ) if ($dbg4);
			$myattrs{$at} = $tg;
		}
	}
}

sub write_out_file {
	my ($tx, $fil) = @_;
	open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n");
	print OF $tx;
	close OF;
	prt("Written " . length($tx) . " characters to [$fil]...\n");
}


sub get_next_tag {
	my ($tx) = shift;
	my $tl = length($tx);
	my $ch = '';
	my $i = 0;
	my $nt = '';
	for ($i = 0; $i < $tl; $i++) {
		$ch = substr($tx,$i,1);
		if ($ch eq '<') {
			$nt = $ch;
			$i++;
			for ( ; $i < $tl; $i++) {
				$ch = substr($tx,$i,1);
				$nt .= $ch;
				if ($ch eq '>') {
					return $nt;
				}
			}
		}
	}
	return $nt;
}

sub inline_paras {
	my ($tx) = shift;
	my $tl = length($tx);
	my $ch = '';
	my $i = 0;
	my $nt = '';
	my $tg = '';
	my $ip = 0;
	for ($i = 0; $i < $tl; $i++) {
		$ch = substr($tx,$i,1);
		if ($ch eq '<') {
			$tg = get_next_tag( substr($tx,$i) );
			if ($tg =~ /^<p>/i) {
				$ip = 1;
			} elsif ($tg =~ /^<p\s+.+>/i) {
				$ip = 1;
			} elsif ($tg =~ /^<\/p>/) {
				$ip = 0;
			}
		}
		if ($ip) {
			if ($ch eq "\r") {
				# skip this
			} elsif ($ch eq "\n") {
				$nt .= ' ';
			} else {
				$nt .= $ch;
			}
		} else {
			$nt .= $ch;
		}
	}
	return $nt;
}

sub check_for_word {
	my $lc = scalar @lines;
	prt( "Processing $lc lines ... seeking MS word meta ...\n" );
	my $isword = 0;
	foreach $line (@lines) {
		chomp $line;
		$line =~ s/\r$//;
		## <meta name="Generator" content="Microsoft Word 10 (filtered)">
		if ($line =~ /<meta\s+name="?Generator"?\s+?content="?(.*)"?>/i) {
			$cont = $1;
			if ($cont =~ /Microsoft/i) {
				$isword = 1;
				prt( "Found [$line] ...\n" );
				last;
			}
		}
	}
	return $isword;
}

sub general_adjustments {
	my ($tx) = shift;
	my $tl = length($tx);
	prt( "Begin len=$tl - Do some expansions, if required ...\n") if $dbg1;
	my $nt = htmlexpand($tx);
	$tl = length($nt);
	prt( "len=$tl - Add font tag to new line ...\n") if $dbg1;
	$nt = tag2newline($nt,'font');
	$tl = length($nt);
	prt( "len=$tl - Add input tag to new line ...\n") if $dbg1;
	$nt = tag2newline($nt,'input');
	$tl = length($nt);
	prt( "len=$tl - Add form tag to new line ...\n") if $dbg1;
	$nt = tag2newline($nt,'form');
	$tl = length($nt);
	prt( "len=$tl - Add comments to new line ...\n") if $dbg1;
	$nt = comments2newline($nt);
	$tl = length($nt);
	prt( "len=$tl - left before trimblanks ...\n") if $dbg1;
	###$nt = trimblanklines($nt);
	$nt = trimblanks($nt);
	$tl = length($nt);
	prt("Now returning $tl characters ...\n") if $dbg1;
	return $nt;
}

sub trimall {
	my ($ln) = shift;
	chomp $ln;
	$ln =~ s/\r$//;
	$ln =~ s/\t/ /g;
	while ($ln =~ /\s\s/) {
		$ln =~ s/\s\s/ /g;
	}
	while ($ln =~ /^\s/) {
		$ln = substr($ln,1);
	}
	while ($ln =~ /\s$/) {
		$ln = substr($ln,0, length($ln) - 1);
	}
	return $ln;
}

sub add_my_style {
	my ($tx) = shift;
	my $nt = '';
	my $tl = length($tx);
	my $ch = '';
	my $i = 0;
	my $tg = '';
	my $di = 0;
	for ($i = 0; $i < $tl; $i++) {
		$ch = substr($tx,$i,1);
		if (!$di) {
			if ($ch eq '<') {
				$tg = get_next_tag( substr($tx,$i) );
				if ($tg =~ /<\/head>/i) {
					$nt .= $newstyle;
					$di = 1;
					prt( "Added new style ...\n" );
				}
			}
		}
		$nt .= $ch;
	}
	return $nt;
}

# eof - cleanhtm02.pl
