#!/Perl
# imgalt01.pl - 2006.10.24 - geoff mclane (geoffmclane.com)
# AIM: To extract the <img alt="..." atribute for translation
# If $addtr is 1, then a search and load current 'tranlation'
# which is added to the table ...
# =====================================================================
use strict;
require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n";
require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp.'.$0.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
# user variable
my $def_folder = 'C:\HOMEPAGE\P26\travel'; 
my $def_input = $def_folder . '\tunisia.htm';
my $def_output = 'tempalt2.htm';
my $addtr = 1;
# from file
my $tr_file = $def_folder . '\tempalt.htm';

my $dosubs = 1; # modify in3 file, changing the alt text, and write out3
my $def_in3 = $def_folder . '\tunisfr2.htm';
my $def_out3 = $def_folder . '\tempalt3.htm';

my @trtable = ();
my @tlines = ();
my @langarr = ();
# debug
my $dbg1 = 1; # show length after adjustments
my $dbg2 = 0; # show 'other' tags
my $dbg3 = 0; # show collections phase
my $dbg4 = 0; # show sub collection phase
my $dbg5 = 0; # show the text collection
my $dbg6 = 0; # show substitution
# program variables
my $line = '';
my @lines = ();
my @frlines = ();
my $cnt = 0;
my $txt = '';
my $ccnt = 0;
my $newtxt = '';
my @attlist = ();
my @altlist = ();
my $in_file = $def_input;
my $out_file = $def_output;

my $htm_head = <<"EOF";
<html>
<head>
<title>Alt List</title>
</head>
<body>
<table border="2">
EOF

my $htm_tail = <<"EOF";
</table>
</body>
</html>
EOF

$in_file = pop @ARGV if (@ARGV);
$out_file = pop @ARGV if (@ARGV);
prt( "Got input from [$in_file], output to [$out_file] ...\n" );
if ( ! -f $in_file) {
	mydie("OOPS: Can NOT locate [$in_file] ...\n");
}
if ($addtr) {
	load_existing_table($tr_file);
}
open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n");
@lines = <IF>;	# slurp it all in
close IF;
$cnt = scalar @lines;
prt("Processing $cnt lines from [$in_file] ...\n");
$txt = join("\n", @lines);
$ccnt = length($txt);
prt("Or $ccnt characters from [$in_file] ...\n");
extract_img_alts( $txt );
show_att_list();
out_alt_list( $out_file );
if ($dosubs && @langarr) {
	open IFF, "<$def_in3" or mydie( "OOPS: Can not open file $def_in3 ... $! ...\n" );
	@frlines = <IFF>;
	close IFF;
	prt( "Process " . scalar @frlines . " lines from [$def_in3] ...\n" );
	$txt = do_substitution();
	open OFF, ">$def_out3" or mydie( "YEEK! Unable to create [$def_out3] ... $! ...\n" );
	print OFF $txt;
	close OFF;
	system( $def_out3 );
}
#$ccnt = length($newtxt);
#write_out_file($newtxt, $out_file);
#system($out_file);
close_log($outfile,1);
exit(0);

# ###############################################
# all subs below
# ##############
sub do_substitution {
	my $lc = scalar @langarr;
	my ($i, $img, $eng, $fr, $j, $c, $d, $imtag, $im2);
	my $frhtm = join('', @frlines);
	my $tl = length($frhtm);
	prt( "Attempting $lc substitutions ... in $tl htm chars...\n" );
	my $fnd = 0;
	my $newfr = ''; # accumulate into here
	for ($i = 0; $i < $lc; $i++) {
		$img = $langarr[$i][0];
		$eng = $langarr[$i][1];
		$fr  = $langarr[$i][2];
		$imtag = '';
		$d = '';
		$fnd = 0;
		$newfr = '';
		$tl = length($frhtm);
		prt( "\nText length now $tl characters ...\n" ) if ($dbg6);
		for ($j = 0; $j < $tl; $j++) {
			$c = substr($frhtm,$j,1);
			if ($d eq '<') {
				if ($c eq "\n") {
					if (substr($imtag,-1) =~ /\s/) {
						$c = '';
					} else {
						$c = ' ';
					}
				}
				$imtag .= $c;
				if ($c eq '>') {
					$d = $c;
					if ($imtag =~ /^<img.+/) {
						$imtag = trimall($imtag);
						if ($imtag =~ /src=['"](.+?)['"]/i) {
							$im2 = $1;
							if ($im2 eq $img) {
								if ($imtag =~ /alt=['"](.+?)['"]/i) {
									substr($imtag, index($imtag,$1),length($1),$fr);
									prt( "Change [$1] to [$fr] ..\n" ) if ($dbg6);
									prt( "$imtag\n" ) if ($dbg6);
									$fnd = 1;
								}
								$newfr .= $imtag; # add in this block
								last;
							}
						}
					}
					$newfr .= $imtag; # add in this block
				}
			} elsif ($c eq '<') {
				$imtag = $c;
				$d = $c;
			} else {
				$newfr .= $c;
			}
		}
		##############################################################
		if (!$fnd) {
			prt( "Did not find [$img] ...\n" );
		} else {
			$j++ if ($j < $tl);
			$newfr .= substr($frhtm, $j) if ($j < $tl); # use the NEW text
			$frhtm = $newfr;
		}
	}
	return $frhtm;
}

sub get_table_block {
	my ($tn) = shift; # table number
	my $lc = scalar @tlines;
	my ($l, $i, $c, $tg, $d, $ln, $ll);
	my $tbl = '';
	my $tc = 0;
	my $in_tbl = 0;
	$d = '';
	for ($l = 0; $l < $lc; $l++) {
		$ln = $tlines[$l]; # entract a line
		$ln = trimall($ln); # clean it up
		$ll = length($ln);
		if ($ll && $in_tbl && (length($tbl))) {
			$c = substr($tbl,-1);
			if ( !(($c =~ /\s/)||($c eq '>')) ) {
				$tbl .= ' ';
			}
		}
		for ($i = 0; $i < $ll; $i++) {
			$c = substr($ln,$i,1);
			$tbl .= $c if ($in_tbl);
			if ($d eq '<') {
				$tg .= $c;
				if ($c eq '>') {
					# got a tag
					if ($tg =~ /<table.*?>/i) {
						$tc++;
						if ($tn == $tc) {
							$in_tbl = 1;
						}
					} elsif ($tg =~ /<\/table>/i) {
						if ($in_tbl) {
							$tbl = substr($tbl, 0, length($tbl) - length($tg));
						}
						$in_tbl = 0;
					}
					$d = '';
				}
			} elsif ($c eq '<') {
				$tg = $c;
				$d = $c;
			}
		}
	}
	return $tbl;
}

sub load_existing_table {
	my ($fil) = shift;
	my $ln = '';
	my $rows = 0;
	my $cols = 0;
	my $in_row = 0;
	my $in_td = 0;
	my $img = '';
	my $eng = '';
	my $fr = '';

	if ( ! -f $fil) {
		mydie( "ERROR: Unable to locate exisitng [$fil] file ... $! ...\n" );
	}
	open INF, "<$fil" or mydie( "ERROR: Unable to OPEN exisitng [$fil] file ... $! ...\n" );
	@tlines = <INF>;
	close INF;
	prt( "Got " . scalar @tlines . " lines from file [$fil] ...\n" );
	my $tt = get_table_block(1);
	##prt( "Table block = [$tt]\n" );
	#$tt = tag2newline($tt, 'caption');
	#$tt = tag2newline($tt, 'tr');
	#$tt = tag2newline($tt, 'th');
	#$tt = tag2newline($tt, 'td');
	#$tt = trimblanklines($tt);
	#prt( "\nTable block 2 = \n[$tt]\n" );
	$tt = alltags2newline($tt);
	##prt( "\nTable block 3 = \n[$tt]\n" );
	@tlines = split("\n",$tt);
	prt( "Got " . scalar @tlines . " table lines ...\n" );
	foreach $ln (@tlines) {
		$ln = trimall($ln);
		if ($ln =~ /<tr.*>/i) {
			$rows++;
			$in_row = 1;
			$cols = 0;
		} elsif ($ln =~ /<th.*>/i) {
			# ignore these
			$cols = 0;
		} elsif ($ln =~ /<caption.*>/i) {
			# ignore
			$cols = 0;
		} elsif ($ln =~ /<td.*>/i) {
			$cols++;
			$in_td = 1;
		} elsif ($ln =~ /<\/caption>/i) {
			# ignore this
			$cols = 0;
		} elsif ($ln =~ /<\/th>/i) {
			# ignore
			$cols = 0;
		} elsif ($ln =~ /<\/tr>/i) {
			$in_row = 0;
			$cols = 0;
		} elsif ($ln =~ /<\/td>/i) {
			$in_td = 0;
		} else {
			# should be a text entry
			if ($in_td) {
				if ($cols == 1) {
					$img = $ln;
					prt( "img=[$ln]\n" ) if ($dbg5);
				} elsif ($cols == 2) {
					$eng = $ln;
					prt( "eng=[$ln]\n" ) if ($dbg5);
				} elsif ($cols == 3) {
					$fr = $ln;
					prt( "fr=[$ln]\n" ) if ($dbg5);
					push(@langarr, [$img, $eng, $fr]);
				}
			}
		}
	}
}

sub alltags2newline {
	my ($tx) = shift;
	my $tl = length($tx);
	my ($i, $c, $d);
	my $nt = '';
	$d = '';
	for ($i = 0; $i < $tl; $i++) {
		$c = substr($tx,$i,1);
		if ($c eq '<') {
			if (length($nt) && (substr($nt,-1) ne "\n")) {
				$nt .= "\n";
			}
		} 
		if (($d eq '>')&&($c ne "\n")) {
			if (length($nt) && (substr($nt,-1) ne "\n")) {
				$nt .= "\n";
			}
		}
		$nt .= $c;
		$d = $c;
	}
	return $nt;
}

sub short_text {
	my ($tx, $len) = @_;
	my $ln = length($tx);
	my $ntx = $tx;
	if ($ln > ($len + 3)) {
		my $hl = int( $len / 2 );
		$ntx = substr($tx,0,$hl);
		$ntx .= '...';
		$hl = $len - $hl;
		$ntx .= substr($tx, $ln - $hl);
	}
	return $ntx;
}

sub write_out_file {
	my ($tx, $fil) = @_;
	open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n");
	print OF $tx;
	close OF;
	prt("Written " . length($tx) . " characters to [$fil]...\n");
}


sub get_tag {
	my ($t) = shift;
	my $m = length($t);
	my ($j, $c);
	my $tg = '';
	for ($j = 0; $j < $m; $j++) {
		$c = substr($t,$j,1);
		if ($c eq '<') {
			$tg = $c;
			$j++;
			for ( ; $j < $m; $j++) {
				$c = substr($t,$j,1);
				$tg .= $c;
				if ($c eq '>') {
					last;
				}
			}
			last;
		}
	}
	return $tg;
}

sub get_att_hash {
	my ($tg) = shift;
	$tg =~ s/\n/ /gm;
	$tg =~ s/\r/ /gm;
	my $ml = length($tg);
	my ($i, $c, $d);
	my $tag = '';
	my $att = '';
	my $val = '';
	my %h = ();
	for ($i = 0; $i < $ml; $i++) {
		$c = substr($tg,$i,1);
		if ($c eq '<') {
			$i++;
			for ( ; $i < $ml; $i++) {
				$c = substr($tg,$i,1);
				if (($c =~ /\s/)||($c eq '>')) {
					last;
				}
				$tag .= $c;
			}
			# got the tag, now the attributes, if any
			prt( "tag=[$tag]\n" ) if ($dbg4);
			while (($c =~ /\s/)&&(($i + 1) < $ml)) {
				while (($c =~ /\s/)&&(($i + 1) < $ml)) {
					$i++;
					$c = substr($tg,$i,1);
				}
				$att = '';
				$val = '';
				if ( !($c =~ /\s/) && ($c ne '>')) {
					$att = $c; # start attribute
					$i++;
					for ( ; $i < $ml; $i++) {
						$c = substr($tg,$i,1);
						if ($c eq '=') {
							last;
						}
						$att .= $c;
					}
					if (($c eq '=')&&(($i + 1) < $ml)) {
						$i++;
						$d = substr($tg,$i,1);
						if (($d eq '"')||($d eq "'")) {
							$val = $d; # keep the inverted comma
						} else {
							$val = $d; # keep first item
							$d = ' ';
						}
						$i++;
						for ( ; $i < $ml; $i++) {
							$c = substr($tg,$i,1);
							if ($c eq '>') {
								last;
							} elsif ($c eq $d) {
								if ($c ne ' ') {
									$val .= $c;
									if (($i + 1) < $ml) {
										$i++;
										$c = substr($tg,$i,1);
									}
								}
								last;
							}
							$val .= $c;
						}
					}
					if (length($att) && length($val)) {
						prt( "att=[$att] value=[$val] c=[$c]\n" ) if ($dbg4);
						if (defined $h{$att}) {
							prt("Duplicate attribute!!! [$att] val1=[" . $h{$att} . "] adding [$val] ...\n" );
							if ($h{$att} ne $val) {
								$h{$att} .= '|' . $val;
							}
						} else {
							$h{$att} = $val;
						}
					} else {
						prt( "Warning: failed to get att=[$att] value=[$val] c=[$c]\n" );
					}
				}
			} # end while 
			#############################
			push(@attlist, [$tag, \%h]);
		}
	}
}

sub trim_tail {
	my ($ln) = shift;
	while ($ln =~ /\s$/m) {
		$ln = substr($ln,0, length($ln) - 1);
	}
	return $ln;
}

sub strip_quotes {
	my ($tx) = shift;
	$tx =~ s/^('|")//;
	$tx =~ s/('|")$//;
	return $tx;
}

sub show_att_list {
	my $ac = scalar @attlist;
	prt( "Got $ac entries in attlist ...\n" );
	my ($i, $src, $alt);
	for ($i = 0; $i < $ac; $i++) {
		my $tg = $attlist[$i][0];
		my %th = $attlist[$i][1];
		prt( "TAG=[$tg]\n" ) if ($dbg4);
		##foreach my $k (keys(%th)) {
		##	my $v = $th{$k};
		##	prt( "k=[$k] v=[$v]\n" );
		##}
		$src = '';
		$alt = '';
		foreach my $k (keys(%{$attlist[$i][1]})) {
			my $v = ${$attlist[$i][1]}{$k};
			prt( "k=[$k] v=[$v]\n" ) if ($dbg4);
			if ($k =~ /^src$/i) {
				$src = strip_quotes($v);
			} elsif ($k =~ /^alt$/) {
				$alt = strip_quotes($v);
			}
		}
		if (length($src) && length($alt)) {
			push(@altlist, [$src, $alt]);
		} else {
			prt( "WARNING: Failed to find src and alt ...\n" );
		}
	}
}

sub get_fr {
	my ($ig) = shift;
	my ($img, $eng, $fr, $i);
	my $icnt = scalar @langarr;
	for ($i = 0; $i < $icnt; $i++) {
		$img = $langarr[$i][0];
		$eng = $langarr[$i][1];
		$fr = $langarr[$i][2];
		if ($img eq $ig) {
			return $fr;
		}
	}
	return '&nbsp;';
}

sub out_alt_list {
	my ($fil) = shift;
	my $ct = scalar @altlist;
	if ($ct) {
		my ($i, $sr, $at, $msg);
		prt( "Outputting $ct alt list entries to $fil ...\n" );
		open OTF, ">$fil" or mydie( "ERROR: Unable to open $fil file ... $! \n" );
		print OTF $htm_head;
		for ($i = 0; $i < $ct; $i++) {
			$sr = $altlist[$i][0];
			$at = $altlist[$i][1];
			$msg = "<tr>\n";
			$msg .= "<td>\n";
			##$msg .= $sr;
			$msg .= '<img src="' . $def_folder . '/' . $sr . '" width="60" height="40">';
			$msg .= "</td>\n";
			$msg .= "<td>\n";
			$msg .= $at;
			$msg .= "</td>\n";
			$msg .= "<td>\n";
			$msg .= get_fr($sr);
			$msg .= "</td>\n";
			$msg .= "</tr>\n";
			print OTF $msg;
		}
		print OTF $htm_tail;
		close OTF;
		###system($fil);
	} else {
		prt( "WARNING: Did not find any src/alt sets ...\n" );
	}
}


sub extract_img_alts {
	my ($tx) = shift;
	my $tl = length($tx);
	my ($i);
	my $ch = '';
	my $nt = '';
	my $tag = '';
	my $att = '';
	my $tgl = '';
	my $intd = 0;
	my $ntag = '';
	for ($i = 0; $i < $tl; $i++) {
		$ch = substr($tx,$i,1);
		if ($ch eq '<') {
			$tag = get_tag( substr($tx,$i) );
			$i += (length($tag) - 1) if (length($tag));
			$tgl = $tag;
			$tgl =~ s/\n/ /g;
			$tgl =~ s/\r/ /g;
			if ($tgl =~ /<img(.*)>/im) {
				$att = $1;
				prt( "IMG tag [$tag]...\n" ) if ($dbg3);
				get_att_hash($tag);
			} elsif ((length($tag) > 4)&&(substr($tag,0,4) eq '<!--')) {
				prt( "Got comment ...\n" ) if ($dbg2);
			} else {
				prt( "other tag [$tag] ...\n" ) if ($dbg2);
			}
			$nt .= $tag;
		} else {
			$nt .= $ch;
		}
	}
	$tl = length($nt);
	prt("Now returning $tl characters ...\n") if $dbg1;
	return $nt;
}

sub trimall {
	my ($ln) = shift;
	chomp $ln;
	$ln =~ s/\r$//;
	$ln =~ s/\t/ /g;
	while ($ln =~ /\s\s/) {
		$ln =~ s/\s\s/ /g;
	}
	while ($ln =~ /^\s/) {
		$ln = substr($ln,1);
	}
	while ($ln =~ /\s$/) {
		$ln = substr($ln,0, length($ln) - 1);
	}
	return $ln;
}

# eof - imgalt01.pl
