#!/perl -w
# NAME: gethrefs.pl
# AIM: Parse a HTML file, and extract HREF links
# 18/07/2010 - revisit and test...
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use constant {
     HRT_UNKNOWN => 0,
     HRT_LOCAL => 1,
     HRT_LINK => 2,
     HRT_SCRIPT => 4,
	 HRT_FILE => 8,
     HRT_BASE => 16
};
use constant {
	FT_UNKNOWN => 0,
	FT_HTML => 1,
	FT_GRAF => 2,
	FT_CSS => 3,
	FT_SCRIPT => 4,
	FT_TEXT => 5,
	FT_ZIP => 6,
    FT_BIN => 7,
    FT_CODE => 8,
    FT_DIR => 9
};

# offsets in file array
use constant {
	OF_FF => 0,	# full file name
	OF_HR => 1,	# array ref of href links
	OF_IM => 2,	# array ref of image links
	OF_LK => 3,	# linked count
	OF_SP => 4,	# spare
	OF_TO => 5,	# links TO
	OF_FM => 6, # links FROM
	OF_FT => 7	# file type
};

my $perl_root = 'C:\Gtools\perl';
unshift(@INC,$perl_root);
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
	my @tmpsp = split(/\\/,$pgmname);
	$pgmname = $tmpsp[-1];
}
my $outfile = $perl_root."\\temp.$pgmname.txt";
open_log($outfile);
###prt( "$0 ... Hello, World ...\n" );

my $in_file = 'C:\GTools\java\examples\JavaTech\Code_List.htm';
##my $in_file = 'C:\HOMEPAGE\GA\travel\maroc\index.htm';
###my $in_file = 'temphtml.htm';
my @all_hrefs = ();

my $outtemp = $perl_root."\\templist.txt";
my $show_full_list = 0;
my $show_missed_files = 0;

# CONSTANTS
###########
# File Type Extensions
my @html_extension = qw( .htm .html .shtml .php );
my @graf_extension = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_extension  = qw( .css );
my @script_extension = qw( .js .class .cgi .java .remote );
my @zip_extension = qw( .zip .tar .gz .jar );
my @txt_extension = qw( .txt .doc .bat .cmd .old .bak .policy .pdf );
my @code_extension = qw( .c .cxx .cpp .h .hxx .hpp .idl .mak );
my @bin_extension = qw( .dat .exe .au );

# private FRONTPAGE folders
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );

# features
my $ignfpd = 1;	# ignore FRONTPAGE folders
my @excludes = qw( desktop.ini php.ini blank.html blank.htm );
my $recurse = 1;	# recursive
my @splexcludes = qw( macpc );

my %ext_hash = ();
my @all_files = ();
my $refcnt = 0;
my @done_files = ();
my %not_found = ();

my ($base_file,$base_dir);
my $base_href = ''; # set if <BASE href="..."> found

# DEBUG
my $dbg1 = 0;	# show discarded material
my $dbg2 = 0;	# show "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]...
my $dbg3 = 0;	# show Processing $lncnt lines from $fil ...
my $dbg4 = 0;	# show File [$name], in [$rdir] ...
my $dbg5 = 0;   # show HREF immediately
my $dbg6 = 0;   # show FOLDERS searched...

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg =~ s/\n$//;
        prt("$msg\n");
    }   
    close_log($outfile,1);
    exit($val);
}

##################################################################
#	OF_FF => 0,	# full file name
#	OF_HR => 1,	# array ref of href links
#	OF_IM => 2,	# array ref of image links
#	OF_LK => 3,	# linked count
#	OF_SP => 4,	# spare
#	OF_TO => 5,	# links TO
#	OF_FM => 6, # links FROM
#	OF_FT => 7	# file type
sub show_results {
	my $fcnt = scalar @all_files;
	my ($i, $ff, $ft, $cnt, $mcnt);
    $mcnt = 0;
	for ($i = 0; $i < $fcnt; $i++) {
		$ft = $all_files[$i][OF_FT];
		if ($ft == FT_HTML) {
			$cnt = $all_files[$i][OF_SP];
			if ($cnt == 0) {
				$ff = $all_files[$i][OF_FF];
				$mcnt++;    # prt( "Missed [$ff]\n" );
			}
		}
	}
    if ($mcnt) {
        prt("Got $mcnt 'missed' files... not marked...\n");
        for ($i = 0; $i < $fcnt; $i++) {
            $ft = $all_files[$i][OF_FT];
            if ($ft == FT_HTML) {
                $cnt = $all_files[$i][OF_SP];
                if ($cnt == 0) {
                    $ff = $all_files[$i][OF_FF];
                    prt( "Missed [$ff]\n" );
                }
            }
        }
    }
}

# =========================================================================
# url_parse - needs some more to remove any other post, like index.htm?a=b...
# ---------
sub url_parse($) {
    my ($url) = @_;
    my $post = '';
    my $name = '';
    my $dir  = '';
    my $ind = index($url,'#');
    if ($ind > 0) {
        $post = substr($url,$ind);
        $url = substr($url,0,$ind);
    }
    if ($url =~ /\/$/) {
        $dir = $url;
    } else {
        ($name,$dir) = fileparse($url);
        if ( !($name =~ /\./) ) {
            # without an EXTENT, assume directory
            $dir .= $name.'/';
            $name = '';
        }
    }
    return $dir,$name,$post;    # url_parse - return (dir,name,post)
}

sub uri_parse2($) {
    my ($uri) = shift;
    $uri =~ /^(([^:\/\?#]+):)?(\/\/([^\/\?#]*))?([^\?#]*)(\?([^#]*))?(#(.*))?/;
    # Then:
    my $scheme    = (defined $2) ? $2 : '';
    my $authority = (defined $4) ? $4 : '';
    my $path      = (defined $5) ? $5 : '';
    my $query     = (defined $7) ? $7 : '';
    my $fragment  = (defined $9) ? $9 : '';
    return $scheme,$authority,$path,$query,$fragment;
}

##################################################################
#########################################################
# Passed an array of extensions,
# check if this is one of them?
#########################################################
sub is_my_extension {
	my ($fil, $rexts) = @_;
	my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
    my $lcext = lc($ext);
    my ($ex);
	foreach $ex (@{$rexts}) {
		return 1 if (lc($ex) eq $lcext);
	}
	return 0;
}

############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################
sub is_htm_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@html_extension) );
}
sub is_graphic_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@graf_extension) );
}
sub is_zip_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@zip_extension) );
}
sub is_css_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@css_extension) );
}
sub is_txt_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@txt_extension) );
}
sub is_code_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@code_extension) );
}
sub is_script_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@script_extension) );
}
sub is_bin_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, \@bin_extension) );
}

#use constant {
#	FT_UNKNOWN => 0,
##	FT_HTML => 1,
##	FT_GRAF => 2,
##	FT_CSS => 3,
##	FT_SCRIPT => 4,
##	FT_TEXT => 5,
##	FT_ZIP => 6,
##  FT_DIR => 7
#};
sub get_file_type_const($);

sub get_file_type_const($) {
	my ($fil) = shift;
	if (is_htm_extension($fil)) {
		return FT_HTML;
	} elsif (is_graphic_extension($fil)) {
		return FT_GRAF;
	} elsif (is_zip_extension($fil)) {
		return FT_ZIP;
	} elsif (is_css_extension($fil)) {
		return FT_CSS;
	} elsif (is_txt_extension($fil)) {
		return FT_TEXT;
	} elsif (is_script_extension($fil)) {
		return FT_SCRIPT;
	} elsif (is_bin_extension($fil)) {
		return FT_BIN;
	} elsif (is_code_extension($fil)) {
		return FT_CODE;
	} elsif ($fil =~ /\/$/) {
        return FT_DIR;
    } elsif ($fil =~ /#/) {
        my $ih = index($fil,'#');
        if ($ih > 0) {
            my $f2 = substr($fil,0,$ih);
            return get_file_type_const($f2);
        }
	} elsif ($fil =~ /\//) {
        return FT_DIR;  # gross assumption
    } elsif ($fil =~ /^\w+$/) {
        return FT_DIR;  # another gross assumption
    }

    pgm_exit(1,"Why UNKNOWN for [$fil]?\n");
	return FT_UNKNOWN;
}

sub file_type_const_to_string {
	my ($ft) = shift;
	if ($ft == FT_HTML) {
		return "html";
	} elsif ($ft == FT_GRAF) {
		return "graphic";
	} elsif ($ft == FT_ZIP) {
		return "zip";
	} elsif ($ft == FT_CSS) {
		return "css";
	} elsif ($ft == FT_TEXT) {
		return "text";
	} elsif ($ft == FT_SCRIPT) {
		return "script";
	} elsif ($ft == FT_BIN) {
        return "binary";
	} elsif ($ft == FT_CODE) {
        return "code";
	} elsif ($ft == FT_DIR) {
        return "directory";
	} elsif ($ft == FT_UNKNOWN) {
		return "unknown";
	}
	pgm_exit(1,"***FIX ME*** uncased type [$ft]!");
    return "";
}

sub fix_rel_url($) {
	my ($path) = @_;
	my @a = split(/\//, $path);
	my $npath = '';
	my $max = scalar @a;
	my @na = ();
	for (my $i = 0; $i < $max; $i++) {
		my $p = $a[$i];
		if ($p eq '.') {
			# ignore this
		} elsif ($p eq '..') {
			if (@na) {
				pop @na;	# discard previous
			} else {
				pgm_exit(1,"ERROR: Got relative .. without previous!!! path=$path\n" );
			}
		} else {
			push(@na,$p);
		}
	}
	foreach my $pt (@na) {
		$npath .= "/" if length($npath);
		$npath .= $pt;
	}
	return $npath;
}

sub is_http_link($) {
    my ($hr) = shift;
    return 1 if ($hr =~ /^http(s*):\/\//);
    return 0;
}

sub get_full_base_href($$) {
    my ($rrhr, $bhr) = @_;
    my $rhr = ${$rrhr};
    my ($nm,$dir,$fhr);
    if ( length($bhr) && !is_http_link($rhr) && !($rhr =~ /^#/) ) {
        if ($bhr =~ /\/$/) {
            $dir = $bhr;    # assume a DIRECTORY
        } else {
            ($nm,$dir) = fileparse($bhr); # assume a FILE, so get the dir only...
        }
        $fhr = $dir.$rhr;
        $fhr = fix_rel_url($fhr);
        ${$rrhr} = $fhr;
        return 1;
    }
    return 0;
}

sub get_href_type_const($);

sub get_href_type_const($) {
	my ($hrf) = shift;
    my ($ih,$id,$is);
	if (is_http_link($hrf)) {
		return HRT_LINK;
	} elsif ($hrf =~ /^ftp:\/\//i) {
		return HRT_LINK;
	} elsif ($hrf =~ /^javascript:/i) {
		return HRT_SCRIPT;
	} elsif (substr($hrf,0,1) eq '#') {
		return HRT_LOCAL;
    }
    #if ( get_full_base_href(\$hrf,$base_href) ) {
    #    if ($hrf =~ /^http(s*):\/\//i) {
    #        return HRT_LINK;
    #    }
    #}

    $ih = index($hrf,'#');
    $id = rindex($hrf,'.');
    $is = rindex($hrf,'/');
    if ($ih > 0) {
        my $hr2 = substr($hrf,0,$ih);
        my $srt = get_href_type_const($hr2);
        $id = rindex($hr2,'.');
        $is = rindex($hr2,'/');
        if ($id > 0) {
            return ($srt | HRT_FILE);
        }
    }
    if ($id > 0) {
        # contains a DOT - assume a file
		return HRT_FILE;
	}
    if ($hrf =~ /\/$/) {
        # ends in '/', assume file - acutally directory
		return HRT_FILE;
    }
    if ($hrf =~ /\//) {
        # contains any '/', assume a file
		return HRT_FILE;
    }
    if ($hrf =~ /^\w+$/) {
        # contains any alphanumeric only, assume a file
		return HRT_FILE;
    }
    pgm_exit(1,"Why UNKNOWN on href [$hrf] ih=$ih id=$id is=$is\n");
	return HRT_UNKNOWN;
}

sub href_type_to_string {
	my ($hrt) = shift;
    my $ret = '';
	if ($hrt & HRT_LINK) {
		$ret .= "extern link ";
    }
	if ($hrt & HRT_SCRIPT) {
		$ret .= "script ";
    }
    if ($hrt & HRT_LOCAL) {
		$ret .= "local ";
    }
    if ($hrt & HRT_FILE) {
        $ret .= "file ";
    }
    if ($hrt & HRT_BASE) {
		$ret .= "BASE ";
    }
	if ($hrt == HRT_UNKNOWN) {
		$ret = "unknown";
	}
    $ret =~ s/\s+$//;
    prt("\n***FIX ME*** uncased type [$hrt]!\n") if (length($ret) == 0);

	return $ret;
}


##############################################


sub get_hrefs_from_string($) {
	my ($ln) = shift;
	my ($i, $j, $line, $ch, $ch2, $len, $tag, $disc, $hcnt);
    my ($bgn, $fhr, $hr, $tail, $max, $hrt, $ft, $ctyp);
	my ($sp,$tag2,$gottag);
	my @hrf = ();
	$ln =~ s/\n/ /g;
	$ln = trim_all($ln);
	# sub write2file { 	my ($txt,$fil) = @_;
	# write2file($fulln,'tempfl.txt');
	$len = length($ln);
	$disc = '';
	$hcnt = 0;
    $base_href = '';    # assume NO <BASE href="...">
    # process single long string, char by char
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($ln,$i,1);
		if ($ch eq '<') {
			$tag = $ch; # start a tag
			$i++;
			$ch = substr($ln,$i,1);
			# could check for things like <! ... later maybe
            $tag2 = '';
            $gottag = 0;
			for (; $i < $len; $i++) {
				$ch = substr($ln,$i,1);
				$tag .= $ch;
				if ($ch eq '>') {
					last;
				}
                if (!$gottag) {
                    if ($ch =~ /\w/) {
                        $tag2 .= $ch;
                    } else {
                        $gottag = 1;
                    }
                }
			}

			if ($tag =~ /(.*\s+)href(\s*)=/i) {
				$bgn = $1;
				$sp = length($2);
				$hcnt++;
				$fhr = substr($tag,length($bgn)+5+$sp);
				$fhr = substr($fhr,1) while ($fhr =~ /^\s/); # remove all LEADING space
				$ch = substr($fhr,0,1);
                prt("$tag [$tag2] [$fhr]\n") if ($dbg5);
				if (($ch eq '"')||($ch eq "'")) {
					$max = length($fhr);
					$hr = '';
					$tail = '';
					# collect actual HREF=
					for ($j = 1; $j < $max; $j++) {
						$ch2 = substr($fhr,$j,1);
						if ($ch eq $ch2) {
							$tail = substr($fhr,$j);
							last;
						}
						$hr .= $ch2;
					}

                    if ($tag2 =~ /^BASE$/i) {
                        $hrt = HRT_BASE;
                        #prt("Got [$tag2] [$fhr] [$hr]\n");
                        my ($d,$n,$p) = url_parse($hr);
                        prt("Got BASE [$hr] = [$d]+[$n]+[$p]\n");
                        $base_href = $hr;
                    } else {
                        get_full_base_href(\$hr,$base_href);
                        $hrt = get_href_type_const($hr);
                    }
					$ctyp = '';
					$ft = FT_UNKNOWN;
					if ($hrt & HRT_FILE) {
						$ft = get_file_type_const($hr);
						$ctyp = "ext[".file_type_const_to_string($ft)."] ";
					}
					$ctyp = 'type['.href_type_to_string($hrt)."] $ctyp";
                    #prt("tag [$tag2] [$hr] $ctyp\n");
					prt( "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]\n" ) if ($dbg2);
					#          href  HRT-type FT-file
					#            0     1       2
					push(@hrf, [$hr, $hrt,    $ft]);
				} else {
					prt( "$hcnt:HREF: fhr[$fhr] pre[$disc] tag[$tag] bgn[$bgn] CHECK ME\n" );
				}
			} else {
				prt( "DISCARDED: pre[$disc] tag[$tag] ...\n" ) if ($dbg1);
			}
			$disc = '';
		} else {
			$disc .= $ch;
		}
	}
	return @hrf;
}

sub parse_file($$) {
    my ($bdir,$bfil) = @_;
	my $fil = $bdir.$bfil;
	my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt,$linkcnt);
	my @hrf = ();
	if ( ! open INF, "<$fil") {
        prt( "WARNING: Can NOT open file [$fil]...\n" );
    	return @hrf;
    }
    my @lines = <INF>;
    close INF;
    $lncnt = scalar @lines;
    prt( "Processing $lncnt lines from [$fil] ...\n" );
    $full = join('',@lines);
    # sub write2file { 	my ($txt,$fil) = @_;
    #my $scrp = return_tag($full,'script');
    ##my $scrp = get_all_tag_text($full,'script');
    ##write2file($scrp,'tempscript.txt');
    ##prt( "Got script text [$scrp]\n" );
    @hrf = get_hrefs_from_string($full);
    $hrcnt = scalar @hrf;
    $filcnt = 0;
    for ($i = 0; $i < $hrcnt; $i++) {
        $fil = $bdir.$hrf[$i][0];
        $typ = $hrf[$i][1];
        if ( ($typ & HRT_FILE) && !($typ & HRT_LINK) ) {
            $filcnt++;
            if (! -f $fil) {
                if (defined $not_found{$fil}) {
                    $not_found{$fil}++;
                } else {
                    prt( "WARNING: File [$fil] NOT found ...\n" );
                    $not_found{$fil} = 1;
                }
            }
        }
    }
    prt( "Got $hrcnt HREF entries, from $bfil ... $filcnt link files\n" );
    $linkcnt = 0;
    my %counted = ();
    my %by_extent = ();
    my %by_fn = ();
    my @dupes = ();
    my $msg = '';
    my ($nm,$dir,$ext,$ind,$ff);
    for ($i = 0; $i < $hrcnt; $i++) {
        $fil = $hrf[$i][0];
        $typ = $hrf[$i][1];
        if ($typ & HRT_LINK) {
            $ind = index($fil,'#');
            $fil = substr($fil,0,$ind) if ($ind > 0);
            if (defined $counted{$fil}) {
                $counted{$fil}++;
            } else {
                $counted{$fil} = 1;
                $linkcnt++;
            	($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
                $by_extent{$ext} = [] if (!defined $by_extent{$ext});
                push( @{$by_extent{$ext}}, $fil );
                $ff = $nm.$ext;
                if ($ext =~ /^\.java/) {
                    if (defined $by_fn{$ff}) {
                        push(@dupes,$ff);
                        $by_fn{$ff}++;
                    } else {
                        $by_fn{$ff} = 1;
                    }
                }
            }
        }
    }
    if ($linkcnt) {
        %counted = ();
        prt("Listing $linkcnt links...\n");
        if ($show_full_list) {
            for ($i = 0; $i < $hrcnt; $i++) {
                $fil = $hrf[$i][0];
                $typ = $hrf[$i][1];
                if ($typ & HRT_LINK) {
                    $ind = index($fil,'#');
                    $fil = substr($fil,0,$ind) if ($ind > 0);
                    if (defined $counted{$fil}) {
                        $counted{$fil}++;
                    } else {
                        $counted{$fil} = 1;
                        prt("$fil\n");
                        $msg .= "$fil\n";
                    }
                }
            }
        }
        foreach $ext (keys %by_extent) {
            my $list = $by_extent{$ext};
            foreach $fil (@{$list}) {
                #prt("$fil\n");
                $msg .= "$fil\n";
            }
        }
        write2file($msg,$outtemp);
        prt("Written list to $outtemp...\n");
        if (@dupes) {
            prt("Note: ".scalar @dupes." duplicated file names...\n");
            prt( join(" ",@dupes)."\n");
        } else {
            prt("Appears NO duplicated names...\n");
        }
    }

	return @hrf;
}

####################################
####################################################################
# process_folder(folder) 
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays
####################################################################
sub process_folder {
	my ($inf) = shift;
	my ($ft,$ff,$nm,$dir,$ext,$val,$fil,$idir);
	my $fcnt = 0;
	prt( "Processing $inf folder ...\n" ) if ($dbg1);
	if ( opendir( DIR, $inf ) ) {
		my @files = readdir(DIR);
		closedir DIR;
        $idir = $inf;
        $idir .= "\\" if (!($idir =~ /(\\|\/)$/));
		foreach $fil (@files) {
			next if (($fil eq ".")||($fil eq ".."));
            $ff = $idir.$fil;
			if ( -d $ff ) {
				if ($recurse) {
					if ($ignfpd && is_fp_folder($fil)) {	# ignore FRONTPAGE folders
						next;
					}
					if (@splexcludes && in_spl_excludes($fil)) {
						next;
					}
					process_folder( $ff );
				}
			} else {
    			$ft = get_file_type_const($fil);
				# NOTE: multidimensional arrays pushed - offsets into arrays
				if ( !in_excludes($fil) ) {  # NOT in @excludes
					($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
					$val = 0;
					$val = $ext_hash{$ext} if ( defined $ext_hash{$ext} );
					$val++;
					$ext_hash{$ext} = $val;
					push(@all_files, [$ff, '', '', 0, 0, '', '', $ft] );
					$fcnt++;
				}
			}
		}
		prt( "Processed $inf folder finding $fcnt files ...\n" ) if ($dbg6);
	} else {
		prt( "ERROR: Failed to open folder $inf ...\n" );
	}
}

################################################
# my $ignfpd = 1;	# ignore FRONTPAGE folders
################################################
sub is_fp_folder {
	my ($inf) = shift;
	foreach my $fil (@fpfolders) {
		if (lc($inf) eq lc($fil)) {
			return 1;
		}
	}
	return 0;
}

####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes {
	my ($fil) = shift;
	my $lcf = lc($fil);
	foreach my $f (@excludes) {
		if (lc($f) eq $lcf) {
			return 1;
		}
	}
	return 0;
}

sub in_spl_excludes {
	my ($fldr) = shift;
	my $lfldr = lc($fldr);
	foreach my $f (@splexcludes) {
		if (lc($f) eq $lfldr) {
			return 1;
		}
	}

	return 0;
}

sub set_status_case {
	my ( $ch, $pch, $inccm, $inlnc, $inqot, $qot ) = @_;
	my $ldbg2 = 0;
	if ($$inccm) {
		if (($ch eq '/')&&($pch eq '*')) {
			$$inccm = 0;
			prt( "status: End C comment /* */ ...\n" ) if ($ldbg2);
		}
	} elsif ($$inlnc ) {
		if ($ch eq "\n") {
			$$inlnc = 0;
			prt( "status: End line comment // ...\n" ) if ($ldbg2);
		}
	} elsif ($$inqot ) {
		if ($ch eq $$qot) {
			prt( "status: End quote $$qot ...\n" ) if ($ldbg2);
			$$inqot = 0;
			$$qot = '';
		}
	} else {
		if ($ch eq '/') {
			if ($pch eq '/') {
				$$inlnc = 1;
				prt( "status: Entered line comment // ...\n" ) if ($ldbg2);
			}
		} elsif ($ch eq '*') {
			if ($pch eq '/') {
				$$inccm = 1;
				prt( "status: Entered C comment /* */ ...\n" ) if ($ldbg2);
			}
		} elsif (($ch eq '"')||($ch eq "'")) {
			$$qot = $ch;
			$$inqot = 1;
			prt( "status: Entered quote $$qot ...\n" ) if ($ldbg2);
		}
	}
}


sub get_all_tag_text {
	my ($txt, $tag) = @_;
	my $len = length($txt);
	my $ldbg1 = 0;
	my $ntxt = '';
	my $ch = '';
	my $pch = '';
	my $ftag = '';
	my $nline = '';
	my $i = 0;
	my $intag = 0;
	my $incomment = 0;
	my $inqot = 0;	# in quotes ' or "
	my $qot = '';
	my $inlnc = 0;	# in line comment
	my $inccm = 0;  # in C comment
	my ($part, $shlen);
	###prt("Processing $len chars for $tag ...\n");
	for ($i = 0; $i < $len; $i++) {
		$pch = $ch;
		$ch = substr($txt, $i, 1);
		set_status_case( $ch, $pch, \$inccm, \$inlnc, \$inqot, \$qot );
		if ($incomment) {
			$ntxt .= $ch;
			if ($ch eq '>') {
				$shlen = -15;
				if (length($ntxt) < 15) {
					$shlen = 0 - length($ntxt);
				}
				prt( "Potential close [".substr($ntxt,$shlen)."] ...($i)" ) if ($ldbg1);
				if (substr($ntxt,-3) eq '-->') {
					if (!$inqot && !$inlnc && !$inccm) {
						prt( " Yes\n" ) if ($ldbg1);
						$incomment = 0;	# no longer IN comment
						prt("End comment <!-- --> ...\n") if ($ldbg1);
					} else {
						if ($inqot) {
							prt( " NO DUE TO IN QUOTE\n" ) if ($ldbg1);
						} elsif ($inlnc) {
							prt( " NO DUE TO IN LINE COMMENT\n" ) if ($ldbg1);
						} elsif ($inccm) {
							prt( " NO DUE TO IN C COMMENT\n" ) if ($ldbg1);
						} else {
							prt( " NO DUE TO SOME REASON!!! **** CHECK ME!!! ****\n" ) if ($ldbg1);
						}
					}
				} else {
					prt( " NO!\n" ) if ($ldbg1);
				}
			}
		} elsif ($intag) {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$part = substr($txt,$i,4);
				if ($part eq '<!--') {	# if a powerful comment starts
					prt("Entering comment <!-- ...\n") if ($ldbg1);
					$incomment = 1;			# unconditionally go to end of this comment
					$ntxt .= $ch;
					next;
				}
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$pch = $ch;
					$ch = substr($txt, $i, 1);
					if ($ch eq '>') {
						last;
					}
					$ftag .= $ch;
				}
				$ntxt .= '<'.$ftag;
				###prt("Got tag [$ftag] ...\n");
				if (lc(substr($ftag,1)) eq lc($tag)) {
					$intag = 0;
				}
			} 
			$ntxt .= $ch;
		} else {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$pch = $ch;
					$ch = substr($txt, $i, 1);
					if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) {
						last;
					}
					$ftag .= $ch;
				}
				###prt("Got tag [$ftag] ...\n");
				if (lc($ftag) eq lc($tag)) {
					$ntxt .= '<'.$ftag.$ch;
					if (($ch eq ' ')||($ch =~ /\s/)) {
						$i++;
						for ( ; $i < $len; $i++ ) {
							$pch = $ch;
							$ch = substr($txt, $i, 1);
							$ntxt .= $ch;
							if ($ch eq '>') {
								last;
							}
						}
					}
					###prt( "Entered tag <$ftag...> ($tag)...\n" );
					$intag = 1;
				}
			}
		}
	}
	return $ntxt;
}

sub unix_2_dos {
	my ($f) = shift;
	$f =~ s/\//\\/g;
	return $f;
}

#	OF_FF => 0,	# full file name
#	OF_HR => 1,	# array ref of href links
#	OF_IM => 2,	# array ref of image links
#	OF_LK => 3,	# linked count
#	OF_SP => 4,	# spare
#	OF_TO => 5,	# links TO
#	OF_FM => 6, # links FROM
#	OF_FT => 7	# file type
sub mark_in_all_files {
	my ($fil, $dir) = @_;
	my $max = scalar @all_files;
	my ($i, $ff, $nm, $dr, $fnd);
	$dir = unix_2_dos($dir);
	$fnd = 0;
	for ($i = 0; $i < $max; $i++) {
		$ff = $all_files[$i][OF_FF];
		($nm,$dr) = fileparse($ff);
		$dr = unix_2_dos($dr);
		if (($nm eq $fil)&&($dir eq $dr)) {
			$all_files[$i][OF_SP]++;
			$fnd++;
		}
	}
	if (!$fnd) {
		prt( "WARNING: [$fil] in [$dir] NOT found ...\n" );
	}
}

sub in_done_files {
	my ($nm, $dr, @dn) = @_;
	my $ct = scalar @dn;
	my ($nnm, $ndr);
	for (my $j = 0; $j < $ct; $j++) {
		$nnm = $dn[$j][0];
		$ndr = $dn[$j][1];
		if (($nnm eq $nm) && ($ndr eq $dr)) {
			return 1;
		}
	}
	return 0;
}


sub process_references {
	my ($bdir) = @_;
	my $rcnt = scalar @all_hrefs;
	my ($i, $ff, $ft, $name, $rdir, $hrt, @new_hrefs, @nxt_hrefs, $dncnt);
	$bdir = unix_2_dos($bdir);
	prt( "Processing $rcnt HREF found ...\n" );
	$dncnt = 0;
	for ($i = 0; $i < $rcnt; $i++) {
		$ff = $bdir.$all_hrefs[$i][0];
		$hrt = $all_hrefs[$i][1];
		$ft = $all_hrefs[$i][2];
		if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
			($name,$rdir) = fileparse( $ff );
			$rdir = unix_2_dos($rdir);
			if ($ff =~ /\.\./) {
				# this back up ...
				prt( "SKIPPING file [$name], in [$rdir] ...\n" );
			} else {
				prt( "File [$name], in [$rdir] ...\n" ) if ($dbg4);
				mark_in_all_files($name, $rdir);
				@new_hrefs = parse_file($rdir, $name);
				$dncnt++;
			}
			push(@done_files, [$name, $rdir]);
		}
	}

	$rcnt = scalar @new_hrefs;
	prt( "Processed $dncnt file, for new $rcnt files ...\n" ) if ($rcnt);
	$dncnt = 0;
	while ($rcnt) {
		@nxt_hrefs = ();
		$dncnt = 0;
		prt( "Processing $rcnt NEW HREF found ...\n" );
		for ($i = 0; $i < $rcnt; $i++) {
			$ff = $bdir.$new_hrefs[$i][0];
			$hrt = $new_hrefs[$i][1];
			$ft = $new_hrefs[$i][2];
			if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
				($name,$rdir) = fileparse( $ff );
				$rdir = unix_2_dos($rdir);
				if ( !in_done_files($name, $rdir, @done_files) ) {
					if ($ff =~ /\.\./) {
						# this back up ...
						prt( "SKIPPING file [$name], in [$rdir] ...\n" );
					} else {
						prt( "File [$name], in [$rdir] ...\n" );
						mark_in_all_files($name, $rdir);
						@nxt_hrefs = parse_file($rdir, $name);
						$dncnt++;
					}
					push(@done_files, [$name, $rdir]);
				}
			}
		}
		@new_hrefs = @nxt_hrefs;
		$rcnt = scalar @new_hrefs;
		prt( "Processed $dncnt file, for new $rcnt files ...\n" ) if ($rcnt);
	}
}

########################################################
### MAIN ###
($base_file,$base_dir) = fileparse( $in_file );
$base_dir = unix_2_dos($base_dir);

process_folder($base_dir);	# get ALL the files, in mutidemensional array

@all_hrefs = parse_file($base_dir, $base_file);

push(@done_files, [$base_file, $base_dir]);

mark_in_all_files($base_file, $base_dir);

$refcnt = scalar @all_hrefs;

process_references($base_dir);

show_results() if ($show_missed_files);

pgm_exit(0,"");

########################################################


# eof - gethrefs.pl
