#!/Perl

use HTML::Parser ();
use Data::Dump ();

my $definp = 'C:/HOMEPAGE/P26/browser1.htm';
###my $definp = 'c:/HOMEPAGE/P26/perl.htm';
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Wednesday.htm";
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm";
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/Russ-04.htm";
my $WHITE_PATTERN2 = "^[ \t\r\n]*\$"; # spacey if ($var =~ /$WHITE_PATTERN2/o ) { ...}
my $defout = 'tempstrip.txt';
my $defstrip = 'tempout.txt';

my ($HO1, $HO2);
my $dncr = 0;
my @tagarr; ## tag array
my $intable = 0;
my $tnewline = 0;
my $tnewhcol = 0;
my $tnewcol = 0;
my $colcount = 0;
my $inbody = 0;
my $inhead = 0;
my $noheadout = 1; ### avoid HEAD tag text
my $mxt = 60; ### 40
my $intd = 0;
my $nointdcr = 1;

open $HO1, ">$defout"  or die "No output file ... [$defout]!\n";
open $HO2, ">$defstrip"  or die "No output file ... [$defstrip]!\n";

print "$0: Started on " . localtime(time()) . "\n"; ### . " in $cwdir ...\n" if $shwtm;

print "Hello, World ... Strip HTML ... ";

my $infile = shift || die "\nERROR: Must give an INPUT FILE ...\n";

$infile = $definp;

die "\nERROR: Can not locate the file [$infile]!\n" if (! -f $infile);

prt ("From $infile ...\n");

my $p = HTML::Parser->new(api_version => 3);

$p->handler(default => \&hand, "event, line, column, text, tagname, attr");

$p->parse_file($infile);

print "$0: Ended on " . localtime(time()) . "\n"; ### . " in $cwdir ...\n" if $shwtm;
close $HO1;
close $HO2;
print "Check results in $defstrip ...\n";
system ($defstrip); ### check out the RESULTS
print "Check results in $defout ...\n";
system ($defout); ### anaysis of data from parser

sub hand {
    my($event, $line, $column, $to, $tagname, $attr) = @_;
	my $typ = uc(substr($event,0,1)); ## get TYPE
	my $ll = length($to);
    my $msg =  "$typ L$line C$column:";
    my @d =  $msg;
    push(@d, $to);
    push(@d, $tagname) if defined $tagname;
	push(@d, $attr) if $attr;
	my $asz = @d; ### get length of array
	my $np = 1;
	my $tag = '*NO_TAG*';
	if (defined $tagname) {
		$tag = uc($tagname);
	}
	my $dtxt = Data::Dump::dump(@d);
	my $text = $to;
	my $txout = ''; ### start the final OUTPUT
	if ($typ eq 'S') {
		## start of tag
		$msg .= " S-$tag";
		if ($tag eq 'HEAD') {
			$inhead = 1;
		} elsif ($tag eq 'BODY') {
			$inbody = 1;
		} elsif ($tag eq 'TABLE') {
			$intable++;
			$tnewline = 0;
			$tnewhcol = 0;
			$tnewcol = 0;
		} elsif ($tag eq 'TR') {
			if ($colcount) {
				$txout .= "\n"; ## print $HO2 "\n";
				$msg .= " Added <TR>NEW LINE!";
				$colcount = 0;
			} else {
				$msg .= " Skipped <TR>NEW LINE!";
			}
			$tnewline++;
			$tnewhcol = 0;
			$tnewcol = 0;
		} elsif ($tag eq 'TH') {
			$msg .= " Added <TH>SPACE!";
			$txout .= " "; ### print $HO2 ' ';
			$tnewhcol++;
		} elsif ($tag eq 'TD') {
			$msg .= " Added <TD>SPACE! in td";
			$txout .= ' '; ### print $HO2 ' ';
			$tnewcol++;
			$intd = 1;
		} else {
			$msg .= ' B tag with no case';
		}
	} elsif ($typ eq 'E') {
		## end tag
		$msg .= " E-$tag";
		if ($tag eq 'HEAD') {
			$inhead = 0;
		} elsif ($tag eq 'BODY') {
			$inbody = 0;
		} elsif ($tag eq 'TABLE') {
			if ($intable) {
				$intable--;
			}
		} elsif ($tag eq 'TR') {
			$tnewline-- if $tnewline;
		} elsif ($tag eq 'TH') {
			$tnewhcol--;
		} elsif ($tag eq 'TD') {
			$tnewcol--;
			$intd = 0;
		} else {
			$msg .= ' E tag with no case';
		}
	} elsif ($typ eq 'T') {
		### text item
		if ($ll) {
			my $addtx = 0;
			$text =~ s/&nbsp;/ /g; ## get back spaces
			$text =~ s/&lt;/</g; ## get back less than
			$text =~ s/&gt;/>/g; ## get back greater than
			$text =~ s/&quot;/"/g; ## get back QUOTES
			### note this LAST
			$text =~ s/&amp;/&/g; ## get back ampersound
			if ($to ne $text) {
				my $l2 = length($text);
				$msg .= " *CHG* [$to]$ll to [$text]$l2 ! ";
				$ll = $l2;
			}

			my @sptxt = split (' ', $text);
			if ($text =~ $WHITE_PATTERN2) {
				my $iscr = 0;
				if ($text =~ /^[\r\n]*$/ ) {
					$msg .= "[all cr/lf stuff for $ll]";
					$iscr = 1;
				} elsif ($text =~ /^ *$/ ) {
					$msg .= "[space for $ll]";
					$txout .= $text; ### print $HO2 $text;
					$msg .= " Added <real>SPACE!";
					$addtx = 1;
					$np = 0;
				} else {
					my $tt = $text;
					$tt =~ s/[\r\n]//g; ### kill the CR/LF
					$txout .= $tt; ### add this space???
					$msg .= " [*CHK* mixed sp[$tt] + cr/lf for $ll]";
					if ($intd && $nointdcr) {
						$msg .= 'intd cr avoided';
						$iscr = 0;
					} else {
						$iscr = 1;
					}
				}
				if ($iscr) {
					if ($dncr >= 2) {
						$msg .= " *dup cr*";
						###return;
					} else {
						$msg .= " Added <newline>!";
						$txout .= "\n"; ### print $HO2 "\n";
						$np = 0;
						$dncr++;
						$colcount = 0;
					}
				}
			} else {
				if (( $ll > 4 ) && 
					(( substr ($text, 0, 4) eq '<!--' )||
					 ( substr ($sptxt[0], 0, 4) eq '<!--'       ))) {
					substr($text, $mxt) = "..." if length($text) > $mxt; ### limit, to say 40
					$msg .= "[" . $text . "]";
					$msg .= "html comment $msg *end c*";
				} else {
					$dncr = 0;
					$txout .= $text; ### print $HO2 $text;
					$colcount += $ll;
					substr($text, $mxt) = "..." if length($text) > $mxt;
					$msg .= "[" . $text . "]ADDED";
					$np = 0;
				}
			}
		} else {
			$msg .= " [$text] SKIPPED";
		}
	} else {
		if (length($text)) {
			$msg .= '[' . $text . ']';
		} else {
			$msg .= '[empty]';
		}
	}

	###if ($np) {
	my $omsg = $msg;
	prt ($dtxt . "\n");
	if (length ($txout)) {
		###if ($tag eq 'HEAD') { $inhead = 1;
		if ($inhead && $noheadout) {
			### no output
			$omsg = "-=> $msg [SKIP-IN-HEAD]"
		} else {
			print $HO2 $txout;
			$omsg = "$msg (written)";
		}
	} else {

		$omsg = "--> $msg (no out text length)";

	}
	prt ($omsg . "\n");

}

sub prt {
	my ($t) = @_;
	print $t;
	print $HO1 $t;
}

# EOF
