#!/Perl
# NAME: htmltools.pl
# AIM: HTML tools - utility functions - 2006-08-26
# to include this, must declare @hrefs and @imgs if
# collecthrefs( $txt, 1 ) or collectimgs( $txt, 1 ) resp. called
my $htmtdbg1 = 0;
my $htmtdbg2 = 0;
my $htmtdbg3 = 0;
my $htmtdbg4 = 0;
my $htmtdbg5 = 0;
my $htmtdbg6 = 0;
my $htmtdbg7 = 0;	# show acquired <body background="something"...>
my @tools_htm = ();

sub trimbothends {
	my ($txt) = shift;
	while ($txt =~ /^\s/) {
		$txt = substr($txt,1);
	}
	while ($txt =~ /\s$/) {
		$txt = substr($txt,0,length($txt)-1);
	}
	return $txt;
}

sub tag2newline { # ($txt2,'td');
	my ($txt, $tag) = @_;
	my $len = length($txt);
	my $ntxt = '';
	my $i;
	my $ch = '';
	my $ft = '';
	my $lcnt = 0;
	for ($i = 0; $i < $len; $i++ ) {
		$ch = substr($txt,$i,1);
		if ($lcnt && ($ch eq '<')) {
			$ft = $ch;
			$i++;
			for ( ; $i < $len; $i++ ) {
				$ch = substr($txt,$i,1);
				$ft .= $ch;
				if ($ch eq '>') {
					if ($ft =~ /^<$tag/i) {
						$ft = "\n".$ft;
					}
					last;
				}
			}
			$ntxt .= $ft;
		} else {
			$ntxt .= $ch;
			if ($ch eq "\n") {
				$lcnt = 0;
			} else {
				$lcnt++;
			}
		}
	}
	return $ntxt;
}

sub comments2newline($) { # ($txt2);
	my ($txt) = shift;
	my $len = length($txt);
	my $ntxt = '';
	my $i;
	my $ch = '';
	my $ft = '';
	my $lcnt = 0;
	for ($i = 0; $i < $len; $i++ ) {
		$ch = substr($txt,$i,1);
		if ($lcnt && ($ch eq '<')) {
			$ft = $ch;	# set start
			$i++;
			if ($i < $len) {
				$ch = substr($txt,$i,1);
				$ft .= $ch;
				#if (($ct eq '!')||($ch eq '?')) {
				if ($ch eq '!') {
					$ft = "\n".$ft;
				}
			}
			$ntxt .= $ft;
		} else {
			$ntxt .= $ch;
			if ($ch eq "\n") {
				$lcnt = 0;
			} else {
				$lcnt++;
			}
		}
	}
	return $ntxt;
}

# strip a tag completely ...
# from <tag. ... to ... </tag>
sub striptag {
	my ($txt, $tag) = @_;
	my $len = length($txt);
	my $ntxt = '';
	my $ch = '';
	my $ftag = '';
	my $nline = '';
	my $i = 0;
	my $intag = 0;
	###prt("Processing $len chars for $tag ...\n");
	for ( ; $i < $len; $i++) {
		$ch = substr($txt, $i, 1);
		if ($intag) {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$ch = substr($txt, $i, 1);
					if ($ch eq '>') {
						last;
					} else {
						$ftag .= $ch;
					}
				}
				###prt("Got tag [$ftag] ...\n");
				###if (lc($ftag) eq lc($tag)) {
				if (lc(substr($ftag,1)) eq lc($tag)) {
					$intag = 0;
				}
			} 
		} else {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$ch = substr($txt, $i, 1);
					if (($ch eq '>')||($ch eq ' ')) {
						last;
					} else {
						$ftag .= $ch;
					}
				}
				###prt("Got tag [$ftag] ...\n");
				if (lc($ftag) eq lc($tag)) {
					if ($ch eq ' ') {
						$i++;
						for ( ; $i < $len; $i++ ) {
							$ch = substr($txt, $i, 1);
							if ($ch eq '>') {
								last;
							}
						}
					}
					$intag = 1;
				} else {
					$ntxt .= '<'.$ftag.$ch;
				}
			} else {
				$ntxt .= $ch;
			}
		}
	}
	return $ntxt;
}

sub return_tag {
	my ($txt, $tag) = @_;
	my $len = length($txt);
	my $ntxt = '';
	my $ch = '';
	my $ftag = '';
	my $nline = '';
	my $i = 0;
	my $intag = 0;
	###prt("Processing $len chars for $tag ...\n");
	for ( ; $i < $len; $i++) {
		$ch = substr($txt, $i, 1);
		if ($intag) {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$ch = substr($txt, $i, 1);
					if ($ch eq '>') {
						last;
					}
					$ftag .= $ch;
				}
				###prt("Got tag [$ftag] ...\n");
				###if (lc($ftag) eq lc($tag)) {
				if (lc(substr($ftag,1)) eq lc($tag)) {
					$intag = 0;
					return $ntxt;
				}
				$ntxt = '';
				$ch = '';
			} 
			$ntxt .= $ch;
		} else {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$ch = substr($txt, $i, 1);
					if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) {
						last;
					}
					$ftag .= $ch;
				}
				###prt("Got tag [$ftag] ...\n");
				if (lc($ftag) eq lc($tag)) {
					if (($ch eq ' ')||($ch =~ /\s/)) {
						$i++;
						for ( ; $i < $len; $i++ ) {
							$ch = substr($txt, $i, 1);
							if ($ch eq '>') {
								last;
							}
						}
					}
					$intag = 1;
				}
			}
		}
	}
	return $ntxt;
}

sub dropcomments { # strip_comments - strip comments - comment strip
	my ($txt) = shift;
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $pch1 = '';
	my $pch2 = '';
	my $i = 0;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt, $i, 1);
		if ($ch eq '<') {
			if ((($i + 3) < $len)&&
				(substr($txt, $i+1, 3) eq '!--')) {
				$i += 2;
				$pch1 = '';
				$pch2 = '';
				for ( ; $i < $len; $i++) {
					$ch = substr($txt, $i, 1);
					if (($ch eq '>')&&($pch1 eq '-')&&($pch2 eq '-')) {
						last;
					}
					$pch2 = $pch1;
					$pch1 = $ch;
				}
			} else {
				$ntxt .= $ch;
			}
		} else {
			$ntxt .= $ch;
		}
	}
	return $ntxt;
}

sub dropcomments_from_array {
	my (@arr) = @_;
	my $txt = '';
	foreach my $ln (@arr) {
		chomp $ln;
		$txt .= ' {=*==*=} ' if (length($txt));
		$txt .= $ln;
	}
	$txt = dropcomments( $txt );
	@arr = split( / \{=\*==\*=\} /, $txt );
	return @arr;
}

# Collect HREF anchors from a TEXT stream
# 25/07/2007 - Skip over comments <!-- to -->
sub collecthrefs {
	my ($txt,$del) = @_;
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq '<') {
			$hrf = $ch;
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$hrf .= $ch;
				# 25/07/2007 watch OUT for COMMENTS - skip these
				if ($ch eq '-') {
					if ($hrf eq '<!--') {
						# we have START of a COMMENT - YUK!!!
						$i++;	# move to NEXT
						for ( ; $i < $len; $i++) {
							$ch = substr($txt,$i,1);
							$hrf .= $ch;
							if ($ch eq '>') {
								if ($hrf =~ /-->$/) {
									last;
								}
							}
						}
					}
				}
				if ($ch eq '>') {
					last;
				}
			}

			if ($hrf =~ /^<a\s/i) {
				if ($del == 0) {
					$ntxt .= $hrf; # no delete - add the text
				}
				prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
				if ($hrf =~ /href=["']*(\S+)["']?./im) {
					$hrf = $1;
					push(@hrefs,$hrf);
					push(@tools_htm,$hrf);
					prt("Got [$hrf] ...\n") if ($htmtdbg2);
				}
			} elsif ($hrf =~ /^<\/a>$/i) {
				if ($del == 0) {
					$ntxt .= $hrf;
				}
			} else {
				$ntxt .= $hrf;
			}
		} else {
			$ntxt .= $ch;
		}
	}
	prt( "Collected ". scalar @hrefs . " HREF ...\n" ) if ($htmtdbg2);

	return $ntxt;
}

# Collect HREF anchors from a TEXT stream
# 25/07/2007 - Skip over comments <!-- to -->
sub collect_hrefs {
	my ($txt) = shift;
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my @hrarr = ();
	my $i;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq '<') {
			$hrf = $ch;	# start a tag
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$hrf .= $ch;
				# 25/07/2007 watch OUT for COMMENTS - skip these
				if ($ch eq '-') {
					if ($hrf eq '<!--') {
						# we have START of a COMMENT - YUK!!!
						$i++;	# move to NEXT
						for ( ; $i < $len; $i++) {
							$ch = substr($txt,$i,1);
							$hrf .= $ch;
							if ($ch eq '>') {
								if ($hrf =~ /-->$/) {
									last;
								}
							}
						}
					}
				}
				if ($ch eq '>') {
					last;
				}
			}

			if ($hrf =~ /^<a\s/i) {
				prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
				if ($hrf =~ /href=(["']?\S+["']?)./im) {
					$hrf = $1;
					$hrf =~ s/"//g;
					$hrf =~ s/'//g;
					push(@hrarr,$hrf);
					prt("Got [$hrf] ...\n") if ($htmtdbg2);
				}
			}
		}
	}
	prt( "Collected ". scalar @hrarr . " HREF ...\n" ) if ($htmtdbg2);
	return @hrarr;
}


sub collecthrefs_nearly_ok {
	my ($txt,$del) = @_;
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq '<') {
			$hrf = $ch;
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$hrf .= $ch;
				if ($ch eq '>') {
					last;
				}
			}

			if ($hrf =~ /^<a\s/i) {
				if ($del == 0) {
					$ntxt .= $hrf; # no delete - add the text
				}
				prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
				if ($hrf =~ /href=["'](\S+)["']./i) {
					$hrf = $1;
					push(@hrefs,$hrf);
					push(@tools_htm,$hrf);
					prt("Got [$hrf] ...\n") if ($htmtdbg2);
				}
			} elsif ($hrf =~ /^<\/a>$/i) {
				if ($del == 0) {
					$ntxt .= $hrf;
				}
			} else {
				$ntxt .= $hrf;
			}
		} else {
			$ntxt .= $ch;
		}
	}
	prt( "Collected ". scalar @hrefs . " HREF ...\n" ) if ($htmtdbg2);

	return $ntxt;
}

sub collect_anchors {
	my ($txt,$del) = @_;
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq '<') {
			$hrf = $ch;
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$hrf .= $ch;
				# 25/07/2007 watch OUT for COMMENTS - skip these
				if ($ch eq '-') {
					if ($hrf eq '<!--') {
						# we have START of a COMMENT - YUK!!!
						$i++;	# move to NEXT
						for ( ; $i < $len; $i++) {
							$ch = substr($txt,$i,1);
							$hrf .= $ch;
							if ($ch eq '>') {
								if ($hrf =~ /-->$/) {
									last;
								}
							}
						}
					}
				}
				if ($ch eq '>') {
					last;
				}
			}

			if ($hrf =~ /^<a\s/i) {
				prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
				push(@hrefs,$hrf);
				push(@tools_htm,$hrf);
				if ($del == 0) {
					$ntxt .= $hrf; # no delete - add the text
				}
			} elsif ($hrf =~ /^<\/a>$/i) {
				if ($del == 0) {
					$ntxt .= $hrf;
				}
			} else {
				$ntxt .= $hrf;
			}
		} else {
			$ntxt .= $ch;
		}
	}
	prt( "Collected ". scalar @tools_htm . " anchors ...\n" ) if ($htmtdbg2);

	return $ntxt;
}


sub ret_anchor_array {
	my ($txt) = shift;
	@tools_htm = ();
	collect_anchors( $txt, 0 );
	return @tools_htm;
}


sub ret_hrefs_array {
	my ($txt) = shift;
	@tools_htm = ();
	collecthrefs( $txt, 0 );
	return @tools_htm;
}

sub collectimgs {
	my ($txt,$del) = @_;
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq '<') {
			$hrf = $ch;
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$hrf .= $ch;
				if ($ch eq '>') {
					last;
				}
			}

			if ($hrf =~ /^<img\s+/i) {
				if ($del == 0) {
					$ntxt .= $hrf;
				}
				prt("Got [$hrf] ...\n") if ($htmtdbg3);
				if ($hrf =~ /src=["']*(\S+)['"]*.*/i) {
					$hrf = $1;
					$hrf =~ s/['"]$//;
					push(@imgs,$hrf);
					prt("Got [$hrf] ...\n") if ($htmtdbg2);
				}
			} else {
				$ntxt .= $hrf;
			}
		} else {
			$ntxt .= $ch;
		}
	}
	return $ntxt;
}

# return image array
# BUT NOT ONLY
# <img border="0" src="[images/construc.gif]" width="87" height="87" alt="under construction">
# BUT ALSO 
# <link rel="stylesheet" type="text/css" href="[home.css]">
# AND
# <link rel="shortcut icon" href="[favicon.ico]">
# AND 
# <script language="JavaScript" type="text/javascript" src="[fgtoc.js]">
# AND
# <body background="[clds4.jpg]" ...>
# AND
# <applet code="[TimerClass.class]"
#        width="90"
#        height="20">
sub ret_imgs_array {
	my ($txt) = shift;
	my @ims = ();
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	my $lnum = 1;
	my $cnum = 0;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		$cnum++;
		if ($ch eq '<') {
			### prt( "htmltools:$lnum:$cnum: Start TAG ...\n" ) if ($htmtdbg6);
			$hrf = $ch;
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$cnum++;
				if ($ch eq "\n") {
					$hrf .= ' ';
					$lnum++;
					$cnum = 0;
				} else {
					$hrf .= $ch;
				}
				if ($ch eq '>') {
					last;
				} elsif ($ch eq '-') {
					if ($hrf eq '<!--') {
						prt( "htmltools:$lnum:$cnum: Entered a COMMENT - get to comment end ...\n" ) if ($htmtdbg6);
						$i++;
						$hrf = '';
						for ( ; $i < $len; $i++) {
							$ch = substr($txt,$i,1);
							$cnum++;
							if ($ch eq "\n") {
								$hrf = '';
								$lnum++;
								$cnum = 0;
							} else {
								$hrf .= $ch;
							}
							if ($ch eq '>') {
								if ($hrf =~ /-->$/) {
									prt( "htmltools:$lnum:$cnum: End COMMENT ...[$hrf]\n" ) if ($htmtdbg6);
									$hrf = '';
									last;	# out of inner inner
								}
							}
						}
						$hrf = '';
						last;	# out of inner
					}
				}
			}
			### prt( "htmltools:$lnum:$cnum: [$hrf]\n" );
			if ($hrf =~ /^<img\s+/i) {
				prt("htmltools:$lnum:$cnum: Got [$hrf] ...\n") if ($htmtdbg3);
				if ($hrf =~ /src=\s*["']*(\S+)['"]*.*/i) {
					$hrf = $1;
					$hrf =~ s/['"]$//;
					push(@ims,$hrf);
					prt("htmltools:$lnum:$cnum: Got IMG SRC [$hrf] ...\n") if ($htmtdbg4);
				} else {
					prt( "WARNING: htmltools:$lnum:$cnum: IMG sans source [$hrf]\n" );
				}
			} elsif ($hrf =~ /<link\s+/i) {
				###if ($hrf =~ /href=["']*([\w\.]+)['"]*.*/i) {
				if ($hrf =~ /href=["']*(\S+)['"]*.*/i) {
					$hrf = $1;
					$hrf =~ s/>$//;
					$hrf =~ s/['"]$//;
					push(@ims,$hrf);
					prt("htmltools:$lnum:$cnum: Got LINK HREF [$hrf] ...\n") if ($htmtdbg4);
				}
			} elsif ($hrf =~ /<script\s+/i) {
				if ($hrf =~ /src=["']*(\S+)['"]*.*/i) {
					$hrf = $1;
					$hrf =~ s/>$//;
					$hrf =~ s/['"]$//;
					push(@ims,$hrf);
					prt("htmltools:$lnum:$cnum: Got SCRIPT SRC [$hrf] ...\n") if ($htmtdbg4);
				}
				#else {
				#	prt( "WARNING: htmltools: SCRIPT sans SRC [$hrf]\n" );
				#}
			} elsif ($hrf =~ /^<body\s+(.*)>$/i) {
				$hrf = $1;
				if ($hrf =~ /background=["']*(\S+)['"]*.*/i) {
					$hrf = $1;
					$hrf =~ s/>$//;
					$hrf =~ s/['"]$//;
					push(@ims,$hrf);
					prt("htmltools:$lnum:$cnum: Got body background [$hrf] ...\n") if ($htmtdbg7);
				}
				#else {
				#	prt( "WARNING: htmltools: body sans background [$hrf]\n" );
				#}
			} elsif ($hrf =~ /^<applet\s+(.*)>$/i) {
				$hrf = $1;
				if ($hrf =~ /code=["']*(\S+)['"]*.*/i) {
					$hrf = $1;
					$hrf =~ s/>$//;
					$hrf =~ s/['"]$//;
					push(@ims,$hrf);
					prt("htmltools:$lnum:$cnum: Got applet code [$hrf] ...\n") if ($htmtdbg7);
				}
				#else {
				#	prt( "WARNING: htmltools: applet sans code [$hrf]\n" );
				#}
			}
		}
		if ($ch eq "\n") {
			$lnum++;
			$cnum = 0;
		}
	}

	if ($htmtdbg5) {
		$i = scalar @ims;
		prt( "Returning $i IMG/OTHER items ...\n" );
		foreach $hrf (@ims) {
			prt( "$hrf " );
		}
		prt("\n");
	}
	return @ims;
}

# just remove a <tag>, and </tag> ...
# but leave the stuff between
sub removetag {
	my ($txt, $tg) = @_;
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq '<') {
			$hrf = $ch;
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$hrf .= $ch;
				if ($ch eq '>') {
					last;
				}
			}
			if ($hrf =~ /^<$tg\s/i) {
			} elsif ($hrf =~ /^<$tg>$/i) {
			} elsif ($hrf =~ /^<\/$tg>$/i) {
			} else {
				$ntxt .= $hrf;
			}
		} else {
			$ntxt .= $ch;
		}
	}
	return $ntxt;
}

sub remove_script {
	my ($txt) = shift;
	my $dbgsc = 0;	# only if a LOT of noise wanted
	my $tg = 'script';
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	my $insc = 0;
	my $quot = '';
	my $pch = '';
	my $qtxt = '';
	my $lstl = '';
	my $lnum = 1;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($insc) {
			if ( (($ch eq '"')||($ch eq "'")) && ( $pch ne "\\" ) ) {
				$quot = $ch;
				$qtxt = $ch;
				$pch = $ch;
				prt( "$lnum:$i: Begin QUOTES [$ch] ...[$lstl]\n" ) if ($dbgsc);
				$i++;
				for ( ; $i < $len; $i++) {
					$ch = substr($txt,$i,1);
					if (($ch eq $quot) && ( $pch ne "\\" )) {
						$qtxt .= $ch;
						prt( "$lnum:$i: End QUOTES [$quot] [$qtxt]\n" ) if ($dbgsc);
						last;
					} elsif ($ch eq "\n") {
						prt( "$lnum:$i: End QUOTES ON NEW LINE [$qtxt]\n" ) if ($dbgsc);
						last;
					}
					$pch = $ch;
					$qtxt .= $ch;
				}
			} elsif (($ch eq '*')&&($pch eq '/')) {
				prt( "$lnum:$i: Begin /* comment ... [$lstl]\n" ) if ($dbgsc);
				$qtxt = "$pch$ch";
				$pch = $ch;
				$i++;
				for ( ; $i < $len; $i++) {
					$ch = substr($txt,$i,1);
					if (($ch eq '/')&&($pch eq '*')) {
						prt( "$lnum:$i: End /* comment ... [$qtxt]\n" ) if ($dbgsc);
						last;
					}
					$pch = $ch;
					$qtxt .= $ch;
				}
			} elsif (($ch eq '/')&&($pch eq '/')) {
				prt( "$lnum:$i: Begin // comment ...[$lstl]\n" ) if ($dbgsc);
				$qtxt = "$pch$ch";
				$pch = $ch;
				$i++;
				for ( ; $i < $len; $i++) {
					$ch = substr($txt,$i,1);
					if ($ch eq "\n") {
						prt( "$lnum:$i: End comment ... [$qtxt]\n" ) if ($dbgsc);
						last;
					}
					$pch = $ch;
					$qtxt .= $ch;
				}
			} elsif ($ch eq '<') {
				$hrf = $ch;
				$lstl .= $ch;
				$i++;
				prt( "$lnum:$i: Being tag ... [$lstl]\n" ) if ($dbgsc);
				if ($i < $len) {
					$ch = substr($txt,$i,1);
					if ($ch =~ /[\w\/!]/) {	# if alphanumeric, or '/' or '!'
						for ( ; $i < $len; $i++) {
							$ch = substr($txt,$i,1);
							$hrf .= $ch if ($ch ne "\n");
							if ($ch eq '>') {
								prt( "$lnum:$i: End tag ... [$hrf]\n" ) if ($dbgsc);
								last;
							} elsif ($hrf eq '<!--') {
								prt( "$lnum:$i: Skip comment tag ... [$hrf]\n" ) if ($dbgsc);
								last;
							}
							if ($ch eq "\n") {
								$lstl = '';
							} else {
								$lstl .= $ch;
							}
						}
						if ($hrf =~ /<\/$tg>/i) {
							prt( "$lnum:$i: End $tg [$hrf]\n" ) if ($dbgsc);
							$insc = 0;
						}
					} else {
						prt( "$lnum:$i: Non-alphanumeric follows - assume NOT tag ...\n" ) if ($dbgsc);
					}
				}
			}

			$pch = $ch;
			if ($ch eq "\n") {
				$lstl = '';
				$lnum++;
			} else {
				$lstl .= $ch;
			}
		} else {
			if ($ch eq '<') {
				$hrf = $ch;
				$i++;
				for ( ; $i < $len; $i++) {
					$ch = substr($txt,$i,1);
					$hrf .= $ch;
					if ($ch eq '>') {
						last;
					}
					$lnum++ if ($ch eq "\n");
				}
				if ($hrf =~ /^<$tg\s+/i) {
					prt( "$lnum:$i: Begin $tg sp [$hrf]\n" ) if ($dbgsc);
					$insc = 1;
					$pch = '';
				} elsif ($hrf =~ /^<$tg>$/i) {
					prt( "$lnum:$i: Begin $tg [$hrf]($i)\n" ) if ($dbgsc);
					$insc = 1;
					$pch = '';
				} else {
					$ntxt .= $hrf;
				}
			} else {
				$ntxt .= $ch;
			}
			$lnum++ if ($ch eq "\n");
		}
	}
	return $ntxt;
}

sub removefont {
	my ($txt) = shift;
	my $ntxt = removetag($txt,'font');
	return $ntxt;
}

sub removetagattrib {
	my ($txt, $tag) = @_;
	my $ntxt = '';
	my $len = length($txt);
	my $ch = '';
	my $hrf = '';
	my $i;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq '<') {
			$hrf = $ch;
			$i++;
			for ( ; $i < $len; $i++) {
				$ch = substr($txt,$i,1);
				$hrf .= $ch;
				if ($ch eq '>') {
					last;
				}
			}
			if ($hrf =~ /^<$tag>$/i) {
				$ntxt .= $hrf;
			} elsif ($hrf =~ /^<$tag\s+/i) {
				###prt("Removing $tag attrib [$hrf]\n");
				$ntxt .= substr($hrf,0,length($tag)+1).'>';
			} else {
				$ntxt .= $hrf;
			}
		} else {
			$ntxt .= $ch;
		}
	}
	return $ntxt;
}


sub removetdattrib {
	my ($txt) = shift;
	my $ntxt = removetagattrib($txt,'td');
	return $ntxt;
}

sub removetrattrib {
	my ($txt) = shift;
	my $ntxt = removetagattrib($txt,'tr');
	return $ntxt;
}

sub substitutions { # ($txt2);
	my ($txt) = shift;
	$txt =~ s/&nbsp;/ /gm;
	$txt =~ s/&amp;/&/gm;
	return $txt;
}

sub trimblanklines {
	my ($txt) = shift;
	my $len = length($txt);
	my $ntxt = '';
	my $ln = '';
	my $ch = '';
	my $i = 0;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if (($ch eq "\n")||($ch eq "\r")) {
			if (length($ln)) {
				if ($ln =~ /\S+/) {	# if got NOT space
					$ln = trimbothends($ln);
					if (length($ln)) {
						$ntxt .= $ln . $ch;
					}
				}
			}
			$ln = '';
		} else {
			$ln .= $ch;
		}
	}
	if (length($ln)) {
		if ($ln =~ /\S+/) {
			$ln = trimbothends($ln);
			if (length($ln)) {
				$ntxt .= $ln;
			}
		}
	}
	return $ntxt;
}

sub trimblanks {
	my ($txt) = shift;
	my $len = length($txt);
	my $ntxt = '';
	my $ln = '';
	my $ch = '';
	my $i = 0;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if (($ch eq "\n")||($ch eq "\r")) {
			if (length($ln)) {
				if ($ln =~ /\S+/) {	# if got NOT space
					###$ln = trimbothends($ln);
					###if (length($ln)) {
						$ntxt .= $ln . $ch;
					###}
				}
			}
			$ln = '';
		} else {
			$ln .= $ch;
		}
	}
	if (length($ln)) {
		if ($ln =~ /\S+/) {
			###$ln = trimbothends($ln);
			###if (length($ln)) {
				$ntxt .= $ln;
			###}
		}
	}
	return $ntxt;
}


sub trimblanklines_OK_maybe {
	my ($txt) = shift;
	my $len = length($txt);
	my $ntxt = '';
	my $ln = '';
	my $ch = '';
	my $i = 0;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq "\n") {
			if (length($ln)) {
				if ($ln =~ /\S+/) {
					while ($ln =~ /^\s/) {
						$ln = substr($ln,1);
					}
					if (length($ln)) {
						$ntxt .= $ln . $ch;
					}
				}
			}
			$ln = '';
		} else {
			$ln .= $ch;
		}
	}
	if (length($ln)) {
		if ($ln =~ /\S?/) {
			$ntxt .= $ln;
		}
	}
	return $ntxt;
}

sub triminlinetd {
	my ($txt) = shift;
	my $len = length($txt);
	my $ntxt = '';
	my $ln = '';
	my $ch = '';
	my $lt = '';
	my $nlt = '';
	my $nln = '';
	my $i = 0;
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($txt,$i,1);
		if ($ch eq "\n") {
			if ($ln =~ /.*<td.*>(.*)<\/td>/i) {
				$lt = $1;	# get text between <td>...</td>
				# $nlt =~ s/\s//g; this removes ALL spaces - NOT GOOD!
				$nlt = trimbothends($lt);
				if (length($nlt)) {
					###prt("Got inline <td>...</td> - [$ln] [$lt] [$nlt]...\n");
					$nln = $ln;
					$nln =~ s/$lt/$nlt/;
					###prt("New line [$nln]...\n");
					$ln = $nln;
				}
			}
			$ntxt .= $ln.$ch;
			$ln = '';
		} else {
			$ln .= $ch;
		}
	}
	if (length($ln)) {
		if ($ln =~ /\S?/) {
			$ntxt .= $ln;
		}
	}
	return $ntxt;
}

# strip from '<?' to '?>', excluding within quotes
sub strip_php_script {
	my ($txt) = shift;
	my $ntxt = '';
	my $max = length($txt);
	my $pch = '';
	my $inphp = 0;
	my $inquote = '';
	for (my $i = 0; $i < $max; $i++) {
		my $ch = substr($txt,$i,1);
		if ($inphp) {
			##print "Should be END PHP ...\n" if (($ch eq '>')&&($pch eq '?'));
			if (length($inquote)) {
				# wating for end of QUOTE
				if ( ($ch eq $inquote) && ($pch ne "\\") ) {
					#print "End of QUOTE\n";
					$inquote = '';
				}
			} else {
				if ( (($ch eq '"')||($ch eq "'")) && ($pch ne "\\") ) {
					#print "Start of QUOTE\n";
					$inquote = $ch;
				} 
				if (($ch eq '>')&&($pch eq '?')) {
					$inphp = 0;
					#print "End of PHP ...\n";
				}
			}
			$pch = $ch;
			next;
		} else {
			if (($ch eq '?') && ($pch eq '<')) {
				$ntxt = substr($ntxt, 0, length($ntxt) - 1);
				#print "Start of PHP ...\n";
				$inphp = 1;
				next;
			}
		}
		$pch = $ch;
		$ntxt .= $ch;
	}
	return $ntxt;
}

sub drop_php_from_array {
	my (@arr) = @_;
	my $txt = '';
	foreach my $ln (@arr) {
		chomp $ln;
		$txt .= ' {=*==*=} ' if (length($txt));
		$txt .= $ln;
	}
	$txt = strip_php_script( $txt );
	@arr = split( / \{=\*==\*=\} /, $txt );
	my $lnc = scalar @arr;
	for (my $i = 0; $i < $lnc; $i++) {
		$arr[$i] .= "\n";
	}
	return @arr;
}


sub htmlexpand {
	my ($rtxt) = shift;
	my $tlen = length($rtxt);
prt( "len=$tlen - Add STYLE TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'style');
$tlen = length($rtxt);
prt( "len=$tlen - Add TABLE TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'table');
$tlen = length($rtxt);
prt( "len=$tlen - Add TR TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'tr');
$tlen = length($rtxt);
prt( "len=$tlen - Add TH TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'th');
$tlen = length($rtxt);
prt( "len=$tlen - Add TD TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'td');
$tlen = length($rtxt);
prt( "len=$tlen - Add SCRIPT TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'script');
$tlen = length($rtxt);
prt( "len=$tlen - Add PRE TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'pre');
$tlen = length($rtxt);
prt( "len=$tlen - Returned from htmlexpand ...\n" ) if ($htmtdbg1);
	return $rtxt;
}

sub dropdoctype {
	my ($txt) = shift;
	my $tlen = length($txt);
    my $pch = '';
    my $ch = '';
    my $rtxt = '';
    for (my $i = 0; $i < $tlen; $i++) {
        $ch = substr($txt,$i,1);
        if ($ch eq '<') {
            $pch = substr($txt,$i);
            if ($pch =~ /^<!DOCTYPE\s+/i) {
                ###prt( "Got DOCTYPE ...\n" );
                $i++;   # move to next
                for (; $i < $tlen; $i++) {
                    $ch = substr($txt,$i,1);
                    if ($ch eq '>') {
                        $ch = '';
                        last;
                    }
                }
            }
        }
        $rtxt .= $ch;
    }
    return $rtxt;
}

sub html_clean_up1 {
	my ($rtxt) = shift;
	my $tlen = length($rtxt);
prt( "len=$tlen - Drop DOCTYPE <!DOCTYPE... > ...\n");
$rtxt = dropdoctype($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Drop comments <!--...--> ...\n");
$rtxt = dropcomments($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Strip <HEAD>...</HEAD> tag ...\n");
$rtxt = striptag($rtxt, 'HEAD');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <script>...</script> tag ...\n");
$rtxt = striptag($rtxt,'script');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <noscript>...</noscript> tag ...\n");
$rtxt = striptag($rtxt,'noscript');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <SELECT>...</SELECT> tag ...\n");
$rtxt = striptag($rtxt,'select');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <font ...> tags ...\n");
$rtxt = removefont($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove <b> tags ...\n");
$rtxt = removetag($rtxt,'b');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <tt> tags ...\n");
$rtxt = removetag($rtxt,'tt');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <nobr> tags ...\n");
$rtxt = removetag($rtxt,'nobr');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <span> tags ...\n");
$rtxt = removetag($rtxt,'span');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <div> tags ...\n");
$rtxt = removetag($rtxt,'div');
$tlen = length($rtxt);
if ($rtxt =~ /<strong>/) {
    prt( "len=$tlen - Remove <strong> tags ...\n");
    $rtxt = removetag($rtxt,'strong');
    $tlen = length($rtxt);
}
prt( "len=$tlen - Remove <ul> tags ...\n");
$rtxt = removetag($rtxt,'ul');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <u> tags ...\n");
$rtxt = removetag($rtxt,'u');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <h1> tags ...\n");
$rtxt = removetag($rtxt,'h1');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <h2> tags ...\n");
$rtxt = removetag($rtxt,'h2');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <li> tags ...\n");
$rtxt = removetag($rtxt,'li');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <br> tags ...\n");
$rtxt = removetag($rtxt,'br');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <html> tags ...\n");
$rtxt = removetag($rtxt,'html');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <body> tags ...\n");
$rtxt = removetag($rtxt,'body');
$tlen = length($rtxt);
prt( "len=$tlen - Remove p attributes ...\n");
$rtxt = removetagattrib($rtxt,'p');
$tlen = length($rtxt);
prt( "len=$tlen - Remove th attributes ...\n");
$rtxt = removetagattrib($rtxt,'th');
$tlen = length($rtxt);
prt( "len=$tlen - Remove tr attributes ...\n");
$rtxt = removetrattrib($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove td attributes ...\n");
$rtxt = removetdattrib($rtxt);
$tlen = length($rtxt);
$rtxt = trimblanklines($rtxt);
$tlen = length($rtxt);
    return $rtxt;
}

sub htmlcleanall {
	my ($rtxt) = shift;
	my $tlen = length($rtxt);
prt( "len=$tlen - Drop comments <!--...--> ...\n");
$rtxt = dropcomments($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Strip <HEAD>...</HEAD> tag ...\n");
$rtxt = striptag($rtxt, 'HEAD');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <script>...</script> tag ...\n");
$rtxt = striptag($rtxt,'script');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <noscript>...</noscript> tag ...\n");
$rtxt = striptag($rtxt,'noscript');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <SELECT>...</SELECT> tag ...\n");
$rtxt = striptag($rtxt,'select');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <font ...> tags ...\n");
$rtxt = removefont($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove <b> tags ...\n");
$rtxt = removetag($rtxt,'b');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <tt> tags ...\n");
$rtxt = removetag($rtxt,'tt');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <nobr> tags ...\n");
$rtxt = removetag($rtxt,'nobr');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <span> tags ...\n");
$rtxt = removetag($rtxt,'span');
$tlen = length($rtxt);
prt( "len=$tlen - Remove th attributes ...\n");
$rtxt = removetagattrib($rtxt,'th');
$tlen = length($rtxt);
prt( "len=$tlen - Remove tr attributes ...\n");
$rtxt = removetrattrib($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove td attributes ...\n");
$rtxt = removetdattrib($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Delete <a...> & </a>\n");
$rtxt = collecthrefs($rtxt,1);
$tlen = length($rtxt);
prt( "len=$tlen - Delete <img...>\n");
$rtxt = collectimgs($rtxt,1);
$tlen = length($rtxt);
prt( "len=$tlen - Do substitutions ...\n");
$rtxt = substitutions($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Trim blank lines ...\n");
$tlen = length($rtxt);
$rtxt = trimblanklines($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Trim inline td ...\n");
$rtxt = triminlinetd($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Returned from htmlcleanall ...\n");
	return $rtxt;
}

# added 18/07/2008
sub get_tag_attr_array {
    my ($tag) = shift;
    my ($i, $i2, $ch, $ln, @arr, $tg, $spc, $com, $incom, $pch, $nch);
    $tg = '';
    $ln = length($tag);
    @arr = ();
    $spc = 0;
    $incom = 0;
    $pch = '';
    my $indent = '    ';
    for ($i = 0; $i < $ln; $i++) {
        $i2 = $i + 1;
        $ch = substr($tag,$i,1);
        $nch = '';
        $nch = substr($tag,$i2,1) if ($i2 < $ln);
        $tg .= $ch; # add it to the tag
        if ($incom) {
            if ($ch eq $com) {
                $incom = 0;
            }
        } else {
            if (($ch eq '"')||($ch eq "'")) {
                $com = $ch;
                $incom = 1;
            } elsif (($ch =~ /^\s$/) && ($nch ne '/') && ($nch ne '>')) {
                $spc++;
                if ($spc > 1) {
                    if (length($tg)) {
                        $tg = $indent.$tg if (@arr);
                        push(@arr,$tg);
                    }
                    $tg = '';
                }
            }
        }
        $pch = $ch;
    }
    if (length($tg)) {
        $tg = $indent.$tg if (@arr);
        push(@arr,$tg);
    }
    return @arr;
}


sub split_tag_attrs {
    my ($tag) = shift;
    my @arr = ();
    if ($tag =~ /\s+/) {
        # there is a chance it has more than ONE attribute
        @arr = get_tag_attr_array($tag);
    } else {
        push(@arr,$tag);
    }
    return @arr;
}

sub array_tags2newline {
    my (@arr) = @_;
    my @narr = ();
    my ($ch, $len);
    my ($ln, $i, $pre, $lc, $l, $tag, $intag);
    my $maxtag = 60;
    $pre = '';
    $lc = scalar @arr;
    $intag = 0;
    $tag = '';
    for ($l = 0; $l < $lc; $l++) {
        $ln = $arr[$l]; # get LINE
        $len = length($ln); # and its LENGTH
        for ($i = 0; $i < $len; $i++) {
            # process char by char
            $ch = substr($ln,$i,1);
            if ($intag) {
                # seek END of tag
                $tag .= $ch;
                if ($ch eq '>') {
                    if ($tag =~ /^<.+>$/) {
                        # got WHOLE tag
                        if (length($tag) > $maxtag) {
                            push(@narr, split_tag_attrs($tag));
                        } else {
                            push(@narr,$tag);
                        }
                    } else {
                        push(@narr,$tag);
                    }
                    $tag = '';
                    $intag = 0;
                }
            } else {
                # seek start of TAG
                if ($ch eq '<') {
                    if (length($pre)) {
                        if ($pre =~ /^\s+$/) {
                            # is all space - dump it
                        } else {
                            push(@narr,$pre);
                        }
                    }
                    $pre = '';  # clear anything before
                    $tag = $ch; # START tag
                    $intag = 1; # and now IN A TAG
                } else {
                    $pre .= $ch;
                }
            }
        }
        # done LINE, so add this tag
        push(@narr,$tag) if length($tag);
        $tag = '';
    }
    return @narr;
}

1;

# eof - htmltools.pl
