#!/perl -w
# NAME: tidyword.pl
# AIM: Take a tidied, Word-filtered HTML file and rewrite its MsoNormal / MsoPlainText
# class paragraphs. That is, no-margin paragraphs separated by a 'blank' paragraph become
# 'normal' paragraphs, and paragraphs that directly follow each other become one
# paragraph with <br> separating the lines.
# 06/05/2007 - geoff mclane - geoffmclane.com
use strict;
use warnings;
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);
# The log file is named temp.<script>.txt.  Strip any directory part from $0,
# whichever slash style was used to invoke the script, so the generated name
# never contains a path separator.
my $outfile = $0;
$outfile =~ s{^.*[\\/]}{};
$outfile = 'temp.'.$outfile.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
my $def_file = "C:\\Documents and Settings\\Geoff McLane\\My Documents\\Louis\\tempout.htm";
# Optional command line: [input-file [output-file]].  With no arguments the
# built-in defaults are used.
my $in_file = (@ARGV > 0) ? $ARGV[0] : $def_file;
my $out_file = (@ARGV > 1) ? $ARGV[1] : "tempnew.htm";
my $lncnt = 0;		# number of input lines
my @lines = ();		# the whole input file, one element per line
my $line = '';		# line (or paragraph text) currently being worked on
my $tag = '';		# text of the tag currently being scanned
my $ch = '';
my $boff = 0;		# offset of the current tag's '<' within its line
my $bln = 0;		# line index of the current tag
my $bpoff = 0;		# offset at which the current paragraph begins
my $bpln = 0;		# line index at which the current paragraph begins
my $epoff = 0;		# offset at which the current paragraph ends
my $epln = 0;		# line index at which the current paragraph ends
# One entry per paragraph found:
# [ begin line, begin offset, end line, end offset, content length (0 = blank), content ]
my @paras = ();
my $inpara = 0;		# set between an opening <p ...> and its </p>
# debug switches
my $dbg1 = 0;	# show line collection
# Stop if the input cannot be read; carrying on would only produce an empty
# output file.  mydie() is not defined in this file - the open of the output
# file relies on it too, so it is presumably supplied by logfile.pl.
open(my $inf, '<', $in_file) or mydie( "ERROR: Unable to open $in_file ... $! ...\n" );
@lines = <$inf>;
close $inf;
$lncnt = scalar @lines;
prt( "Processing $lncnt lines from $in_file ...\n" );
# Pass 1: scan every input line for tags and record, in @paras, the position
# of each paragraph that is opened by a <p ...> tag carrying attributes and
# closed by </p>.
# Limitations kept from the original scan:
#  - a tag is only recognised when its '<' and '>' are on the same line;
#  - a bare <p> (no attributes) does not open a recorded paragraph.
for (my $i = 0; $i < $lncnt; $i++) {
	$line = $lines[$i];
	while ($line =~ /(<[^>]*>)/g) {
		$tag = $1;
		$boff = $-[0];		# offset of the tag's '<'
		my $eoff = $+[0] - 1;	# offset of the tag's '>'
		$bln = $i;
		if ($tag =~ /^<p\s+(.*)>/i) {
			prt( "$tag [$1] line $i:$boff\n" ) if ($dbg1);
			$bpoff = $boff;
			$bpln = $bln;
			prt( "WARNING: Already in paragraph!\n" ) if ($inpara);
			$inpara = 1;
		} elsif ($tag =~ /^<\/p>/i) {
			# Only close a paragraph whose start was recorded.  Without this
			# guard a </p> belonging to a bare <p> would be stored with the
			# start position left over from an earlier paragraph.
			if ($inpara) {
				prt( "$tag CLOSED line $i:$eoff para: $bpln:$bpoff to $i:$eoff\n" ) if ($dbg1);
				# length and content are placeholders here; they are filled
				# in when the paragraph text is extracted.
				push(@paras, [$bpln, $bpoff, $i, $eoff, 1, "content"] );
				$inpara = 0;
			}
		}
	}
}

# Pass 2: fetch the text of every recorded paragraph and note whether it holds
# any visible content.  Slot 4 of each entry receives the content length
# (0 for a blank paragraph) and slot 5 the content itself.
my $pcnt = scalar @paras;
prt( "Looking at $pcnt paragraphs ...\n" );
foreach my $para (@paras) {
	# entry layout: [begin line, begin offset, end line, end offset, length, content]
	($bpln, $bpoff, $epln, $epoff) = ($para->[0], $para->[1], $para->[2], $para->[3]);
	prt( "Paragraph: $bpln:$bpoff to $epln:$epoff\n" ) if ($dbg1);
	$line = getpara( $bpln, $bpoff, $epln, $epoff, @lines );
	my $ln2 = getcontent( $line );
	if ($ln2 =~ /\S/) {
		# at least one non-space character: a real paragraph
		prt( "content: $ln2 [$bpln, $bpoff, $epln, $epoff]\n" );
		$para->[4] = length($ln2);
		$para->[5] = $ln2;
	} else {
		# nothing but white space: mark the paragraph as blank
		prt( "$line (BLANK) [$bpln, $bpoff, $epln, $epoff]\n" );
		$para->[4] = 0;
		$para->[5] = "";
	}
}

# Pass 3: write the output file.
#  - A line that belongs to no recorded paragraph is copied through unchanged.
#  - A paragraph with content is written as its bare content, preceded by <p>
#    unless the previous paragraph was joined with <br>, and followed by <br>
#    when the next recorded paragraph also has content, else by </p>.
#  - A blank paragraph is dropped.
# Only the extracted content is written for a paragraph line; anything else on
# the lines the paragraph occupies is not copied.
open(my $out, '>', $out_file) or mydie( "ERROR: Unable to create $out_file ... $! ...\n" );
my $lastbr = 0;	# 1 when the paragraph written last ended in <br>
for (my $i = 0; $i < $lncnt; $i++) {
	$line = $lines[$i];
	my $i2 = lineinparas($i);
	if ($i2 >= $pcnt) {
		# lineinparas() answers with a value past the end of @paras when the
		# line is outside every recorded paragraph.
		print {$out} $line;
		chomp $line;
		prt( "$line ($i)\n" );
		next;
	}
	$bpln = $paras[$i2][0];
	$bpoff = $paras[$i2][1];
	$epln = $paras[$i2][2];
	$epoff = $paras[$i2][3];
	my $flg = $paras[$i2][4];	# content length of this paragraph, 0 = blank
	my $ln2 = $paras[$i2][5];	# content of this paragraph
	# content length of the following paragraph, 0 when there is none
	my $flg2 = (($i2 + 1) < $pcnt) ? $paras[$i2+1][4] : 0;
	if ($flg) {
		my $endp = $flg2 ? '<br>' : '</p>';
		if ($bpln == $epln) {
			prt( "DEAL WITH LINE $i ...[$bpln, $bpoff, $epln, $epoff]\n<p>$ln2$endp ($lastbr)\n" );
		} else {
			prt( "DEAL WITH LINES $i-$epln ...[$bpln, $bpoff, $epln, $epoff]\n$ln2$endp ($lastbr)\n" );
		}
		print {$out} "<p>" if ($lastbr == 0);
		print {$out} $ln2.$endp;
		$lastbr = ($endp eq '<br>') ? 1 : 0;
	} else {
		prt( "KILL LINE $i ...[$bpln, $bpoff, $epln, $epoff] ($ln2) ($lastbr)\n" );
	}
	# A paragraph spanning several lines has been handled as a whole: resume
	# with the line after its last one.
	$i = $epln if ($epln > $bpln);
}

# Buffered write errors only surface at close, so check it.
close($out) or mydie( "ERROR: Unable to close $out_file ... $! ...\n" );
close_log($outfile,1);
exit(0);


# lineinparas( $line_index )
# Find the recorded paragraph that the given input line belongs to.
# Returns the index into @paras of the first paragraph that starts on the line
# or, for a paragraph spanning several lines, that covers it.  Returns
# $pcnt + 1 when no paragraph is found.  The search stops at the first
# paragraph starting beyond the line, so it relies on @paras being in
# ascending line order.
sub lineinparas {
	my $want = shift;
	my $idx = 0;
	while ($idx < $pcnt) {
		my $first = $paras[$idx][0];	# line the paragraph begins on
		my $final = $paras[$idx][2];	# line the paragraph ends on
		return $idx if ($first == $want);
		last if ($first > $want);
		# Here the paragraph begins before the wanted line, so it covers the
		# line exactly when it ends on or after it.
		return $idx if ($final >= $want);
		$idx++;
	}
	return $pcnt + 1;
}

# getcontent( $paragraph_text )
# Reduce the raw text of a paragraph (opening tag through closing tag) to its
# content and return it, with every &nbsp; entity replaced by a space.
#  - When the text matches <p attrs>...</p> within a single line, the captured
#    part is returned.  The opening part of the pattern is greedy, so inline
#    markup inside the paragraph is swallowed with it and only the text after
#    the last '>' preceding </p> survives.
#    NOTE(review): confirm that dropping inline markup is intended.
#  - Otherwise the text between the first '>' and the next '<' (or the end of
#    the string) is returned; when that is empty the input is kept as is.
sub getcontent {
	my $text = shift;
	if ($text =~ /^<p\s+.*>(.*)<\/p>/) {
		$text = $1;
	} else {
		my $gt = index($text, '>');
		if ($gt >= 0) {
			my $lt = index($text, '<', $gt + 1);
			my $inner = ($lt < 0)
				? substr($text, $gt + 1)
				: substr($text, $gt + 1, $lt - $gt - 1);
			$text = $inner if length($inner);
		}
	}
	$text =~ s/&nbsp;/ /g;
	return $text;
}

# getpara( $bpl, $bpo, $epl, $epo, @lns )
# Cut the text running from offset $bpo on line $bpl up to and including
# offset $epo on line $epl out of the list of lines @lns, and return it as one
# string.  Line indexes and offsets are zero based; the line ends of any
# lines before the last one are kept.
sub getpara {
	my ( $bpl, $bpo, $epl, $epo, @lns ) = @_;
	my $text = $lns[$bpl];
	$text = substr($text, $bpo) if ($bpo);
	# Everything on one line: the end offset is relative to the original
	# line, so take the start offset off again.
	return substr($text, 0, $epo - $bpo + 1) if ($bpl == $epl);
	for my $n ($bpl + 1 .. $epl) {
		# whole lines in between, the last line only up to the end offset
		$text .= ($n == $epl) ? substr($lns[$n], 0, $epo + 1) : $lns[$n];
	}
	return $text;
}

# eof
