#!/perl -w
# NAME: chkhsrc.pl
# AIM: Given an input FOLDER, check every HTML file found for <img src="...">
# references and make sure each referenced file EXISTS ...
# 31/05/2007 - geoff mclane - geoffair.com/mperl/index.htm
use strict;
use warnings;
use File::Basename;
# Make the local tool directory searchable before loading the logging helpers.
# open_log(), prt() and close_log() are not defined in this file; presumably
# they are supplied by logfile.pl - verify against that file.
unshift(@INC, 'C:/GTools/perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);	# not referenced anywhere else in this file
# Name the log after the script's file name only. Strip the directory part
# whichever separator was used (\ or /) and whether or not the path is
# absolute, so a separator can never end up inside the log file name.
my $pgmname = (split(/[\\\/]/, $0))[-1];
$pgmname = $0 unless (defined($pgmname) && length($pgmname));
my $outfile = "temp.$pgmname.txt";
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
# ---- configuration and shared state -------------------------------------
my $recurse = 1;	# when true, process_folder() descends into sub-folders
my $ignfpd = 1;		# when true, skip the FRONTPAGE folders listed in @fpfolders
# folder names treated as FRONTPAGE housekeeping folders (compared case-insensitively)
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );
# default folder to scan; replaced by parse_args() when arguments are given
my $in_folder = "C:\\HOMEPAGE\\GeoffAir";
my @in_files = ();	# full paths of the HTML files collected by process_folder()
my $cnt = 0;		# number of HTML files collected
my $file = '';		# loop variable of the main per-file loop
my $warnings = '';	# accumulated warning messages, newline separated, printed at the end
my $imgcnt = 0;		# running total of image sources returned by check_images()

# debug only bits - set a flag to 1 to enable the corresponding prt() output
my $dbg1 = 0;	# show entering folder ...
my $dbg10 = 0;	# show diag for get_img_srcs() ...
my $dbg11 = 0;	# in image processing show entered/exits script
my $dbg12 = 0;	# in image processing show processing count
my $dbg13 = 0;	# in image processing show ok - found file
my $dbg14 = 0;	# in image processing show image count found
my $dbg15 = 0;	# in image processing show image count when NONE found

# ---- main flow -----------------------------------------------------------
# Command-line arguments, if any, replace the default input folder.
parse_args(@ARGV);

# Collect the HTML files under the input folder into @in_files.
process_folder( $in_folder );
$cnt = scalar @in_files;
prt( "Found $cnt HTML files to process ...\n" );

# Read each HTML file, extract its <img> sources and verify them.
foreach $file (@in_files) {
	# 3-arg open with a lexical handle: the file name can never be taken
	# as an open mode, and the handle is private to this iteration.
	if (open(my $inf, '<', $file)) {
		my @lines = <$inf>;
		close $inf;
		my @srcs = get_img_srcs($file, @lines);
		$imgcnt += check_images( $file, @srcs );
	} else {
		# An unreadable file used to be skipped silently; report it the
		# same way the other problems are reported.
		my $msg = "WARNING: Unable to open [$file] - $!";
		$warnings .= "\n" if length($warnings);
		$warnings .= $msg;
		prt( "$msg\n" );
	}
}
prt( "Processed $imgcnt image sources ...\n" );
if (length($warnings)) {
	prt( "\nWARNINGS FOLLOW:\n$warnings\n" );
} else {
	prt( "No warnings ...\n" );
}
# NOTE(review): meaning of the second close_log() argument is not visible
# in this file - confirm against logfile.pl.
close_log($outfile,1);
exit(0);

##################################

# Verify that each image source found in one HTML file exists on disk.
#
# Arguments:
#   $file - path of the HTML file the sources were taken from
#   @srcs - list of [ $src, $lnnos ] array refs as built by get_img_srcs()
# Returns:
#   the number of entries in @srcs (remote references included).
#
# A source is resolved relative to the directory of $file. Every source
# that cannot be found is reported through prt() and appended to $warnings.
sub check_images {
	my ($file, @srcs) = @_;
	my ($nm, $dir) = fileparse($file);
	my $scnt = scalar @srcs;
	if (!$scnt) {
		prt( "Found NO imgs in [$file] ...\n" ) if ($dbg15);
		return $scnt;
	}
	prt( "Found $scnt imgs in $nm ...\n" ) if ($dbg14);
	foreach my $ent (@srcs) {
		my ($src, $lnn) = @{$ent};
		# Not a local file: anything carrying a URI scheme (http:, https:,
		# ftp:, data:, mailto: ...) or a protocol-relative //host/ prefix.
		# The scheme must be at least two characters long so that a
		# drive-letter path such as C:\pics\a.gif is still checked locally.
		next if ($src =~ m{^(?:[a-z][a-z0-9+.\-]+:|//)}i);
		my $found = (-f $dir.$src) ? 1 : 0;
		if (!$found && $src =~ /%[0-9A-Fa-f]{2}/) {
			# The reference may be URL-encoded (e.g. my%20pic.gif for
			# "my pic.gif"); retry with the escapes decoded.
			(my $dec = $src) =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
			$found = 1 if (-f $dir.$dec);
		}
		if ($found) {
			prt( "$src - ok\n" ) if ($dbg13);
		} else {
			my $msg = "WARNING: [$src] $file:$lnn NOT FOUND!";
			$warnings .= "\n" if length($warnings);
			$warnings .= $msg;
			prt( "$msg\n" );
		}
	}
	return $scnt;
}

# Scan the lines of one HTML file and collect the SRC value of every <img> tag.
#
# Arguments:
#   $fil - path of the HTML file (used only in a diagnostic message)
#   @lns - the lines of the file, in order
# Returns:
#   a list of [ $src, $lnnos ] array refs. $lnnos is "first:last", the
#   zero-based indexes into @lns of the lines where the tag starts and ends.
#
# Lines from a <script ...> line up to the line containing </script> are
# skipped. A tag without a quoted SRC attribute is reported through prt()
# and appended to $warnings.
sub get_img_srcs {
	my ($fil, @lns) = @_;
	my $lc = scalar @lns;
	my $scnt = 0;
	my ($nm,$dir) = fileparse( $fil );
	prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg12);
	my @isrc = ();
	my $inscript = 0;
	for (my $i = 0; $i < $lc; $i++) {
		my $ln = $lns[$i];
		chomp $ln;
		prt( "$i [$ln] ...\n" ) if ($dbg10);
		if ($inscript) {
			if ($ln =~ /<\/script>/i) {
				$inscript = 0;
				prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
			}
			next;
		}
		# Handle every <img> of the line here, instead of gluing the text
		# after a tag onto the following line: that reported the wrong line
		# for the 2nd, 3rd ... tag and lost them altogether when no further
		# line followed (e.g. a whole page written on one line).
		# '<img' directly at the end of a line is accepted as well; its
		# attributes are then picked up from the continuation lines.
		while ( $ln =~ /<img(?:\s+|$)(.*)/i ) {
			my $iln = $1;
			prt( "Found [$iln] ...\n" ) if ($dbg10);
			my $bgnln = $i;
			# The tag is not closed yet: append following lines, but stop
			# on the last line so nothing is read beyond the end of @lns.
			while ( ($iln !~ />/) && ($i + 1 < $lc) ) {
				$i++;
				my $nxln = $lns[$i];
				chomp $nxln;
				prt( "Adding [$nxln] ...\n" ) if ($dbg10);
				$iln .= ' '.$nxln;
			}
			my $lnnos = "$bgnln:$i";
			# Split at the closing '>': $iln keeps the tag, $bal the rest.
			my $bal = '';
			my $ind = index($iln, '>');
			if ($ind != -1) {
				$bal = substr($iln, $ind+1);
				$iln = substr($iln, 0, $ind+1);
			}
			# SRC value in double or single quotes. The value stops at the
			# matching quote, so it may contain blanks and cannot run into
			# the next attribute. The look-behind keeps attributes that
			# merely end in 'src' (lowsrc=, data-src=) from being taken.
			if ($iln =~ /(?<![\w\-])src\s*=\s*(?:"([^"]+)"|'([^']+)')/i) {
				my $src = defined($1) ? $1 : $2;
				prt( "SRC = $src In line [$iln]$lnnos...\n" ) if ($dbg10);
				push(@isrc, [$src, $lnnos]);
				$scnt++;
			} else {
				my $msg = "WARNING: SRC NOT FOUND in [$iln]$lnnos...";
				$warnings .= "\n" if length($warnings);
				$warnings .= $msg;
				prt( "$msg\n" );
			}
			# Continue scanning with whatever follows the tag.
			$ln = $bal;
		}
		# No (more) images in the remaining text - does a script start here?
		if ( $ln =~ /<script.*>/i ) {
			$inscript = 1;
			prt( "Entered a SCRIPT ...\n" ) if ($dbg11);
			if ($ln =~ /<\/script>/i) {
				$inscript = 0;
				prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
			}
		}
	}
	prt( "Returning $scnt img sources ...\n") if ($dbg10);
	return @isrc;
}


# Take the input folder from the command-line arguments.
# Each argument in turn is stored in $in_folder, so the last one given
# wins; with no arguments the current value is left untouched.
sub parse_args {
	my (@av) = @_;
	foreach my $arg (@av) {
		$in_folder = $arg;
	}
}

# Return 1 when the file name ends in .htm or .html (any letter case),
# otherwise 0. Only the last dot-separated suffix is considered.
sub is_my_ext {
	my ($fil) = @_;
	my $suffix = lc( (fileparse( $fil, qr/\.[^.]*/ ))[2] );
	return ($suffix eq ".htm" || $suffix eq ".html") ? 1 : 0;
}

# my $ignfpd = 1;	# ignore FRONTPAGE folders
# Return 1 when the given folder name equals one of the names in
# @fpfolders (compared without regard to letter case), otherwise 0.
sub is_fp_folder {
	my ($inf) = @_;
	my $wanted = lc($inf);
	my $hits = grep { lc($_) eq $wanted } @fpfolders;
	return $hits ? 1 : 0;
}


# Collect the HTML files of a folder into @in_files.
#
# Arguments:
#   $inf - path of the folder to read
#
# Each entry that is_my_ext() accepts is appended to @in_files with its
# full path. Sub-folders are processed recursively when $recurse is set;
# with $ignfpd set, folders recognised by is_fp_folder() are skipped.
# A folder that cannot be opened is reported through prt() and ignored.
sub process_folder {
	my ($inf) = shift;
	my $fcnt = 0;
	prt( "Processing $inf folder ...\n" ) if ($dbg1);
	# Lexical directory handle: this sub calls itself, so a shared bareword
	# handle would only be safe as long as it is closed before recursing.
	if ( opendir( my $dh, $inf ) ) {
		my @files = readdir($dh);
		closedir $dh;
		foreach my $fil (@files) {
			next if (($fil eq ".")||($fil eq ".."));
			# Paths are joined with a backslash, as in the default $in_folder.
			my $ff = $inf."\\".$fil;
			if ( -d $ff ) {
				next unless ($recurse);
				next if ($ignfpd && is_fp_folder($fil));	# ignore FRONTPAGE folders
				process_folder( $ff );
			} elsif (is_my_ext($fil)) {
				push(@in_files, $ff);
				$fcnt++;
			}
		}
		# $fcnt counts this folder's own files only, not those of sub-folders.
		prt( "Processed $inf folder finding $fcnt HTML files ...\n" );
	} else {
		prt( "ERROR: Failed to open folder $inf ...\n" );
	}
}

# eof - chkhsrc.pl
