#!/Perl
# iso639.pl
# AIM to download the table from the site, and build a 'validation' table ...
use strict;
use warnings;
use LWP::Simple;
require "logfile.pl" or die "Missing logfile.pl ...\n"; # my simple log file and some other utility subs
require "htmltools.pl" or die "Missing htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp'.$0.'.txt';
my $dodownload = 0;
my $URL = 'http://www.loc.gov/standards/iso639-2/englangn.html';
my $out_file = 'iso639.htm';
open_log($outfile);
prt( "$0 ... Hello, World...\n" );
prt( "Moment ... loading $URL ...\n" );
my $tbltext = '';
my @lines = ();
my @tlines = ();
my $cnt = 0;
my $line = '';
my $tln = '';
my $col = 0;
my $col1 = '';
my $col2 = '';
my $col3 = '';
my $col4 = '';
my $i = 0;
my @colist = ();
my %iso3 = ();
my %iso2 = ();
my $tblcnt = 0;
my $lncnt = 0;
my $maxcnt = 15; # maximum search for </td>
my $enttbl = 0;
my $bgntbl = 0;
my $thcnt = 0;
if ($dodownload) {
	$tbltext = get($URL);
	$cnt = length($tbltext);
	prt( "Downloaded $cnt characters ... writing $out_file ...\n" );
	write2file( $tbltext, $out_file );
} else {
	prt( "Loading $out_file ... moment ...\n" );
	open INF, "<$out_file" or mydie( "ERROR: Failed to open [$out_file] ... $! \n" );
	@lines = <INF>;	# slurp it into lines
	close INF;
	$tbltext = join("",@lines);
	$tbltext = tag2newline($tbltext, 'table');
	$tbltext = tag2newline($tbltext, 'tr');
	$tbltext = tag2newline($tbltext, 'th');
	$tbltext = tag2newline($tbltext, 'td');
	$tbltext = trimblanklines($tbltext);
	write2file($tbltext, 'temptbl.htm');
	$cnt = length($tbltext);
	prt( "Loaded $cnt characters ... from $out_file ...\n" );
}
@lines = split("\n", $tbltext);
$cnt = scalar @lines;
prt( "Got $cnt lines to process ... \n" );
foreach $line (@lines) {
	$tln = trimall( html_subs($line) );
	push(@tlines, $tln) if length($tln);
}
$cnt = scalar @tlines;
prt( "Got $cnt trimmed lines to process ... \n" );
##foreach $line (@tlines) {
for ($i = 0; $i < $cnt; $i++) {
	$lncnt++;
	$line = $tlines[$i];
	if ($line =~ /<table.*>/i) {
		$tblcnt++;
		$bgntbl = 1;
		$enttbl++;	# count another entry to a table
		prt( "$lncnt - Enter table $tblcnt ...[$line]\n" );
		$thcnt = 0;
	} elsif ($line =~ /<\/table>/i) {
		if ($tblcnt) {
			prt( "$lncnt - Exit table $tblcnt ...[$line]\n" );
			$tblcnt--;
		} else {
			prt( "$lncnt - Exit table NO COUNT! ...[$line]\n" );
		}
	} elsif ($line =~ /<tr.*?>/i) {
		if ($col == 4) {
			###prt( "[$col1] [$col2] [$col3] [$col4] \n" );
			# Language Name English French 639-2 639-1 
			push(@colist, [ $col1, $col2, $col3, $col4 ]);
		} else {
			if ($bgntbl || ($enttbl < 4) || ($thcnt > 0)) {
				# $col should be zero if just entered, and not yet in second
			} else {
				prt( "$lncnt - $tblcnt CHECK col=$col ...[$col1] [$col2] [$col3] [$col4]\n" );
			}
		}
		$bgntbl = 0;
		$thcnt = 0;
		$col = 0;
	} elsif ($line =~ /<td.*?>/i) {
		my $lc = 0;
		while ( !($line =~ /<\/td>/i) ) {
			$i++;
			$lncnt++;
			if ($i < $cnt) {
				$line .= $tlines[$i];
			} else {
				last;
			}
			$lc++;
			if ($lc > $maxcnt) {
				last; # exit anyway ...
			}
		}
		$tln = removetag($line, 'td');
		$tln = removetag($tln, 'span');
		$tln = removetag($tln, 'p');
		$tln = removetag($tln, 'br');
		$tln = removetag($tln, 'font');
		$tln = striptag($tln, 'a');
		$col++;
		if ($col == 1) {
			$col1 = $tln;
		} elsif ($col == 2) {
			$col2 = $tln;
		} elsif ($col == 3) {
			$col3 = $tln;
		} elsif ($col == 4) {
			$col4 = $tln;
		}
	} elsif ($line =~ /<th.*?>/i) {
		my $lc = 0;
		while ( !($line =~ /<\/th>/i) ) {
			$i++;
			$lncnt++;
			if ($i < $cnt) {
				$line .= $tlines[$i];
			} else {
				last;
			}
			$lc++;
			if ($lc > $maxcnt) {
				last; # exit anyway ...
			}
		}
		$thcnt++;
	}
}
$cnt = scalar @colist;
prt( "Got $cnt in country list ... \n" );
# Language Name English French 639-2 639-1 
for ($i = 0; $i < $cnt; $i++) {
	$col1 = trimall($colist[$i][0]);
	$col2 = trimall($colist[$i][1]);
	$col3 = trimall($colist[$i][2]);
	$col4 = trimall($colist[$i][3]);
	if ($col3 =~ /\//) {
		my @arr = split("/",$col3);
		foreach my $i3 (@arr) {
			if (defined $iso3{$i3}) {
				if ( !is_the_same( $iso3{$i3}, $col1) ) {
					prt( "Exists! [$i3] = [" . $iso3{$i3} . "] adding [$col1] ...\n" );
					$iso3{$i3} .= ' ' . $col1;
				}
			} else {
				$iso3{$i3} = $col1;
			}
		}
	} else {
		if (defined $iso3{$col3}) {
			if ( !is_the_same( $iso3{$col3}, $col1) ) {
				prt( "Exists! [$col3] = [" . $iso3{$col3} . "] adding [$col1] ...\n" );
				$iso3{$col3} .= ' ' . $col1;
			}
		} else {
			$iso3{$col3} = $col1;
		}
	}
	if (length($col4)) {
		if (defined $iso2{$col4}) {
			if ( !is_the_same( $iso2{$col4}, $col1) ) {
				prt( "Exists! [$col4] = [" . $iso2{$col4} . "] adding [$col1] ...\n" );
				$iso2{$col4} .= ' ' . $col1;
			}
		} else {
			$iso2{$col4} = $col1;
		}
	}
	###prt( "[$col1] [$col2] [$col3] [$col4] \n" );
}
my @keys3 = keys %iso3;
my @keys2 = keys %iso2;
my $kc3 = scalar @keys3;
my $kc2 = scalar @keys2;
prt( "Got $kc3 ISO 639-2 and $kc2 ISO 639-1 ... writting file ...\n" ); 

my $msg = '';
my $maxln = 75;
$line = '';
$cnt = 0;
$msg .= 'tmbstr ISO639_2 = {' . "\n";
foreach $tln (sort @keys3) {
	$cnt++;
	$line .= '"'.$tln.'"';
	##if ($cnt < $kc3) {
		$line .= ', ';
	##}
	if (length($line) > $maxln) {
		$msg .= $line . "\n";
		$line = '';
	}
}
if (length($line)) {
	$msg .= $line . "\n";
	$line = '';
}
$msg .= " 0 };\n\n";
$msg .= 'tmbstr ISO639_1[] = {' . "\n";
$cnt = 0;
foreach $tln (sort @keys2) {
	$cnt++;
	$line .= '"'.$tln.'"';
	##if ($cnt < $kc2) {
		$line .= ', ';
	##}
	if (length($line) > $maxln) {
		$msg .= $line . "\n";
		$line = '';
	}
}
if (length($line)) {
	$msg .= $line . "\n";
	$line = '';
}
$msg .= " 0 };\n";
write2file( $msg, "tempiso.txt" );
prt( "All written to tempiso.txt ...\n" );
if ($dodownload) {
	system( $out_file );
}

close_log($outfile,1);
exit(0);

sub is_the_same {
	my ($t1, $t2) = @_;
	my (@a1, @a2);
	my $ct = 0;
	my $ct1 = 0;
	my $ct2 = 0;
	@a1 = split(/[;,]/,$t1);
	@a2 = split(/[;,]/,$t2);
	foreach my $b1 (@a1) {
		$ct1++;
		$b1 = trimall($b1);
		foreach my $b2 (@a2) {
			$b2 = trimall($b2);
			if ($b1 eq $b2) {
				$ct2++;
				last;
			}
		}
	}
	if ($ct1 == $ct2) {
		return 1;
	}
	return 0;
}

sub write2file {
	my ($txt,$fil) = @_;
	open WOF, ">$fil" or mydie( "ERROR: Unable to open $fil! - $! \n" );
	print WOF $txt;
	close WOF;
}

sub trimall {
	my ($ln) = shift;
	chomp $ln;
	$ln =~ s/\r$//;
	$ln =~ s/\t/ /g;
	while ($ln =~ /\s\s/) {
		$ln =~ s/\s\s/ /g;
	}
	while ($ln =~ /^\s/) {
		$ln = substr($ln,1);
	}
	while ($ln =~ /\s$/) {
		$ln = substr($ln,0, length($ln) - 1);
	}
	return $ln;
}

sub html_subs {
	my ($htm) = shift;
	$htm = substitutions($htm);
	$htm =~ s/&eacute;/e/gm;
	$htm =~ s/&egrave;/e/gm;
	$htm =~ s/&ccedil;/c/gm;
	$htm =~ s/&aring;/a/gm;
	$htm =~ s/&#231;/c/gm;
	$htm =~ s/&agrave;/a/gm;
	$htm =~ s/&#180;/'/gm;
	$htm =~ s/&iuml;/i/gm;
	$htm =~ s/&acirc;/a/gm;
	$htm =~ s/&Icirc;/I/gm;
	$htm =~ s/&#252;/u/gm;
	return $htm;
}


# eof - iso639.pl
