#!/Perl
# cleantd01.pl - 2006.10.24 - geoff mclane (geoffmclane.com)
# AIM: To clean certain items from a HTML document ...
# specifically target microsoft word 'filtered' output.
# Search for 'tables', and remove 'style' attribute from <td>,
# and '<p ...></p><o:p></o:p> and <span> from within ...
# =====================================================================
use strict;
require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n";
require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp.'.$0.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
# user variable
my $def_input = 'C:\HOMEPAGE\P26\travel\tunisfr2.htm';
my $def_output = 'temptunis.htm';

# debug
my $dbg1 = 1; # show length after adjustments
# program variables
my $line = '';
my $word = 0;
my @lines = ();
my $cnt = 0;
my $txt = '';
my $ccnt = 0;
my $newtxt = '';
my $in_file = $def_input;
my $out_file = $def_output;
$in_file = pop @ARGV if (@ARGV);
$out_file = pop @ARGV if (@ARGV);
prt( "Got input from [$in_file], output to [$out_file] ...\n" );
if ( ! -f $in_file) {
	mydie("OOPS: Can NOT locate [$in_file] ...\n");
}
open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n");
@lines = <IF>;	# slurp it all in
close IF;
$word = check_for_word();
$cnt = scalar @lines;
prt("Processing $cnt lines from [$in_file] ... " . ($word ? 'is word' : 'not word') . "\n");
$txt = join("\n", @lines);
$ccnt = length($txt);
prt("Or $ccnt characters from [$in_file] ...\n");
$newtxt = make_adjustments( $txt );
$ccnt = length($newtxt);
write_out_file($newtxt, $out_file);
system($out_file);
close_log($outfile,1);
exit(0);

# ###############################################
# all subs below
# ##############

sub short_text {
	my ($tx, $len) = @_;
	my $ln = length($tx);
	my $ntx = $tx;
	if ($ln > ($len + 3)) {
		my $hl = int( $len / 2 );
		$ntx = substr($tx,0,$hl);
		$ntx .= '...';
		$hl = $len - $hl;
		$ntx .= substr($tx, $ln - $hl);
	}
	return $ntx;
}

sub write_out_file {
	my ($tx, $fil) = @_;
	open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n");
	print OF $tx;
	close OF;
	prt("Written " . length($tx) . " characters to [$fil]...\n");
}


sub check_for_word {
	my $lc = scalar @lines;
	prt( "Processing $lc lines ... seeking MS Word meta ...\n" );
	my $isword = 0;
	my ($cont);
	foreach $line (@lines) {
		chomp $line;
		$line =~ s/\r$//;
		## <meta name="Generator" content="Microsoft Word 10 (filtered)">
		if ($line =~ /<meta\s+name="?Generator"?\s+?content="?(.*)"?>/i) {
			$cont = $1;
			if ($cont =~ /Microsoft/i) {
				prt( "Found [$cont] [$line] ...\n" );
				if ($cont =~ /Word/i) {
					$isword = 1;
					prt( "Found WORD signature ...\n" );
					last;
				}
			}
		}
	}
	return $isword;
}

sub get_tag {
	my ($t) = shift;
	my $m = length($t);
	my ($j, $c);
	my $tg = '';
	for ($j = 0; $j < $m; $j++) {
		$c = substr($t,$j,1);
		if ($c eq '<') {
			$tg = $c;
			$j++;
			for ( ; $j < $m; $j++) {
				$c = substr($t,$j,1);
				##if (($c eq "\n")||($c eq "\r")) {
				##	$c = ' ';
				##}
				$tg .= $c;
				if ($c eq '>') {
					last;
				}
			}
			last;
		}
	}
	return $tg;
}

sub trim_tail {
	my ($ln) = shift;
	while ($ln =~ /\s$/m) {
		$ln = substr($ln,0, length($ln) - 1);
	}
	return $ln;
}


sub del_td_style {
	my ($td) = shift;
	my $mx = length($td);
	my ($j, $c, $d);
	my $ntd = '';
	my $hds = 0;
	my $ss = '';
	$d = '';
	for ($j = 0; $j < $mx; $j++) {
		$c = substr($td,$j,1);
		if ($hds && ($c =~ /s/i) && (($mx - $j) > 7)) {
			$ss = substr($td,$j); # get balance
			if ($ss =~ /^style=(.*)/) {
				$j += 6;
				$d = substr($td,$j,1); # get " or '
				if (($d eq '"')||($d eq "'")) {
					$j++;
				} else {
					$d = ' ';
				}
				for ( ; $j < $mx; $j++) {
					$c = substr($td,$j,1);
					if (($c eq $d)||($c eq '>')) {
						last;
					}
				}
			}
		}
		if ($c =~ /\s/) {
			$hds = 1;
		} else {
			$hds = 0;
		}
		if ($c ne $d) {
			if ($c eq '>') {
				$ntd = trim_tail($ntd);
			}
			$ntd .= $c;
		}
		$d = '';
	}
	return $ntd;
}

sub make_adjustments {
	my ($tx) = shift;
	my $tl = length($tx);
	my ($i);
	my $ch = '';
	my $nt = '';
	my $tag = '';
	my $att = '';
	my $tgl = '';
	my $intd = 0;
	my $ntag = '';
	for ($i = 0; $i < $tl; $i++) {
		$ch = substr($tx,$i,1);
		if ($ch eq '<') {
			$tag = get_tag( substr($tx,$i) );
			$i += (length($tag) - 1) if (length($tag));
			$tgl = $tag;
			$tgl =~ s/\n/ /g;
			$tgl =~ s/\r/ /g;
			if ($tgl =~ /<td(.*)>/im) {
				$intd = 1;
				$att = $1;
				prt( "TD tag [$att] [$tag]...\n" );
				if ($tgl =~ /<td\s+?(.+)>/im) {
					$att = $1;
					if ($tgl =~ /style=/i) {
						prt( "Is TD with STYLE attrib [$att] ...\n" );
						$tag = del_td_style($tag);
						prt( "New tag [$tag]\n" );
					} else {
						prt( "Is TD with attrib [$att] ...\n" );
					}
				} else {
					prt( "Is simple TD tag ...\n" );
				}
			} elsif ((length($tag) > 4)&&(substr($tag,0,4) eq '<!--')) {
				prt( "Got comment ...\n" );
			} elsif ($tag =~ /<\/td>/) {
				prt( "Close TD [$tag]\n" );
				$intd = 0;
			} else {
				prt( "other tag [$tag] ...\n" );
			}
			$nt .= $tag;
		} else {
			$nt .= $ch;
		}
	}
	$tl = length($nt);
	prt("Now returning $tl characters ...\n") if $dbg1;
	return $nt;
}

sub trimall {
	my ($ln) = shift;
	chomp $ln;
	$ln =~ s/\r$//;
	$ln =~ s/\t/ /g;
	while ($ln =~ /\s\s/) {
		$ln =~ s/\s\s/ /g;
	}
	while ($ln =~ /^\s/) {
		$ln = substr($ln,1);
	}
	while ($ln =~ /\s$/) {
		$ln = substr($ln,0, length($ln) - 1);
	}
	return $ln;
}

# eof - cleantd01.pl
