#!/perl -w
# NAME: gethrefs.pl
# AIM: Parse a HTML file, and extract HREF links
# 20/10/2011 - Add user options - parse_args()
# 18/07/2010 - revisit and test...
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;
my $perl_dir = 'C:\GTools\perl';
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
###require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";

use constant {
     HRT_UNKNOWN => 0,
     HRT_LOCAL => 1,
     HRT_LINK => 2,
     HRT_SCRIPT => 3,
	 HRT_FILE => 4,
     HRT_BASE => 5
};
use constant {
	FT_UNKNOWN => 0,
	FT_HTML => 1,
	FT_GRAF => 2,
	FT_CSS => 3,
	FT_SCRIPT => 4,
	FT_TEXT => 5,
	FT_ZIP => 6,
    FT_DIR => 7
};

# offsets in file array
use constant {
	OF_FF => 0,	# full file name
	OF_HR => 1,	# array ref of href links
	OF_IM => 2,	# array ref of image links
	OF_LK => 3,	# linked count
	OF_SP => 4,	# spare
	OF_TO => 5,	# links TO
	OF_FM => 6, # links FROM
	OF_FT => 7	# file type
};

# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.2 2011-10-20";
my $load_log = 0;

my $in_file = '';

my $debug_on = 0;
my $def_file = 'C:\GTools\java\examples\JavaTech\Code_List.htm';
##my $in_file = 'C:\HOMEPAGE\GA\travel\maroc\index.htm';
###my $in_file = 'temphtml.htm';
my @all_hrefs = ();
my $verbosity = 0;

my $splout = $perl_dir."\\tempspec.txt";

# CONSTANTS
###########
# File Type Extensions
my @html_extension = qw( .htm .html .shtml .php );
my @graf_extension = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_extension  = qw( .css );
my @script_extension = qw( .js .class .cgi .java );
my @zip_extension = qw( .zip .tar .gz .jar );
my @txt_extension = qw( .txt .doc );

# private FRONTPAGE folders
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );

# features
my $show_not_found = 0;
my $show_local_links = 0;   # also show INTERNAL (in page) links
my $ignfpd = 1;	# ignore FRONTPAGE folders
my @excludes = qw( desktop.ini php.ini blank.html blank.htm );
my $recurse = 1;	# recursive
my @splexcludes = qw( macpc );

my %ext_hash = ();
my @all_files = ();
my $refcnt = 0;
my @done_files = ();
my %not_found = ();

my ($base_file,$base_dir);
my $base_href = ''; # set if <BASE href="..."> found


# DEBUG
my $dbg1 = 0;	# show discarded material
my $dbg2 = 0;	# show "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]...
my $dbg3 = 0;	# show Processing $lncnt lines from $fil ...
my $dbg4 = 0;	# show File [$name], in [$rdir] ...
my $dbg5 = 0;   # show HREF immediately
my $dbg6 = 0;   # show 'tag' immediately

### program variables
my @warnings = ();
my $cwd = cwd();
my $os = $^O;

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
           prt("$itm\n");
        }
        prt("\n");
    } else {
        ### prt( "\nNo warnings issued.\n\n" );
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}


##################################################################
#	OF_FF => 0,	# full file name
#	OF_HR => 1,	# array ref of href links
#	OF_IM => 2,	# array ref of image links
#	OF_LK => 3,	# linked count
#	OF_SP => 4,	# spare
#	OF_TO => 5,	# links TO
#	OF_FM => 6, # links FROM
#	OF_FT => 7	# file type
sub show_results {
	my $fcnt = scalar @all_files;
	my ($i, $ff, $ft, $cnt);
	for ($i = 0; $i < $fcnt; $i++) {
		$ft = $all_files[$i][OF_FT];
		if ($ft == FT_HTML) {
			$cnt = $all_files[$i][OF_SP];
			if ($cnt == 0) {
				$ff = $all_files[$i][OF_FF];
				prt( "Missed [$ff]\n" );
			}
		}
	}
}

##################################################################
sub get_href_type_const {
	my ($hrf) = shift;
	if ($hrf =~ /^http(s*):\/\//i) {
		return HRT_LINK;
	} elsif ($hrf =~ /^ftp:\/\//i) {
		return HRT_LINK;
	} elsif ($hrf =~ /^javascript:/i) {
		return HRT_SCRIPT;
	} elsif (substr($hrf,0,1) eq '#') {
		return HRT_LOCAL;
    } elsif (index($hrf,'.') >= 0) {
		return HRT_FILE;
	}
	return HRT_UNKNOWN;
}

sub href_type_to_string {
	my ($hrt) = shift;
	if ($hrt == HRT_LINK) {
		return "extern link";
	} elsif ($hrt == HRT_SCRIPT) {
		return "script";
	} elsif ($hrt == HRT_LOCAL) {
		return "local";
    } elsif ($hrt == HRT_FILE) {
		return "file";
    } elsif ($hrt == HRT_BASE) {
		return "BASE";
	} elsif ($hrt == HRT_UNKNOWN) {
		return "unknown";
	}
	return '***FIX ME*** uncased type';
}

sub get_hrefs_from_string($) {
	my ($ln) = shift;
	my ($i, $j, $line, $ch, $ch2, $len, $tag, $disc, $hcnt);
    my ($bgn, $fhr, $hr, $tail, $max, $hrt, $ft, $ctyp);
	my ($sp,$tag2,$gottag);
	my @hrf = ();
	$ln =~ s/\n/ /g;
	$ln = trim_all($ln);
	# sub write2file { 	my ($txt,$fil) = @_;
	# write2file($fulln,'tempfl.txt');
	$len = length($ln);
	$disc = '';
	$hcnt = 0;
    # process single long string, char by char
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($ln,$i,1);
		if ($ch eq '<') {
			$tag = $ch; # start a tag
			$i++;
			$ch = substr($ln,$i,1);
			# could check for things like <! ... later maybe
            $tag2 = '';
            $gottag = 0;
			for (; $i < $len; $i++) {
				$ch = substr($ln,$i,1);
				$tag .= $ch;
				if ($ch eq '>') {
					last;
				}
                if (!$gottag) {
                    if ($ch =~ /\w/) {
                        $tag2 .= $ch;
                    } else {
                        $gottag = 1;
                    }
                }
			}

			if ($tag =~ /(.*\s+)href(\s*)=/i) {
				$bgn = $1;
				$sp = length($2);
				$hcnt++;
				$fhr = substr($tag,length($bgn)+5+$sp);
				$fhr = substr($fhr,1) while ($fhr =~ /^\s/); # remove all LEADING space
				$ch = substr($fhr,0,1);
                prt("$tag [$tag2] [$fhr]\n") if ($dbg5);
				if (($ch eq '"')||($ch eq "'")) {
					$max = length($fhr);
					$hr = '';
					$tail = '';
					# collect actual HREF=
					for ($j = 1; $j < $max; $j++) {
						$ch2 = substr($fhr,$j,1);
						if ($ch eq $ch2) {
							$tail = substr($fhr,$j);
							last;
						}
						$hr .= $ch2;
					}
                    if ($tag2 =~ /^BASE$/i) {
                        $hrt = HRT_BASE;
                        prt("Got [$tag2] [$fhr]\n");
                    } else {
                        $hrt = get_href_type_const($hr);
                    }
					$ctyp = '';
					$ft = FT_UNKNOWN;
					if ($hrt == HRT_FILE) {
						$ft = get_file_type_const($hr);
						$ctyp = "ext[".file_type_const_to_string($ft)."] ";
					}
					$ctyp = 'type['.href_type_to_string($hrt)."] $ctyp";
                    prt("tag [$tag2] [$hr] $ctyp\n") if ($dbg6);
                    $base_href = $hr if ($hrt == HRT_BASE);
					prt( "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]\n" ) if ($dbg2);
					#          href  HRT-type FT-file
					#            0     1       2
					push(@hrf, [$hr, $hrt,    $ft]);
				} else {
					prt( "$hcnt:HREF: fhr[$fhr] pre[$disc] tag[$tag] bgn[$bgn] CHECK ME\n" );
				}
			} else {
				prt( "DISCARDED: pre[$disc] tag[$tag] ...\n" ) if ($dbg1);
			}
			$disc = '';
		} else {
			$disc .= $ch;
		}
	}
	return @hrf;
}

sub parse_file($$) {
    my ($bdir,$bfil) = @_;
	my $fil = $bdir.$bfil;
	my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt);
	my @hrf = ();
	if ( ! open INF, "<$fil") {
        prt( "WARNING: Can NOT open file [$fil]...\n" );
    	return \@hrf;
    }
    my @lines = <INF>;
    close INF;
    $lncnt = scalar @lines;
    prt( "Processing $lncnt lines from [$fil] ...\n" );
    $full = join('',@lines);
    # sub write2file { 	my ($txt,$fil) = @_;
    #my $scrp = return_tag($full,'script');
    ##my $scrp = get_all_tag_text($full,'script');
    ##write2file($scrp,'tempscript.txt');
    ##prt( "Got script text [$scrp]\n" );
    @hrf = get_hrefs_from_string($full);
    $hrcnt = scalar @hrf;
    $filcnt = 0;
    for ($i = 0; $i < $hrcnt; $i++) {
        $fil = $bdir.$hrf[$i][0];
        $typ = $hrf[$i][1];
        if ($typ == HRT_FILE) {
            $filcnt++;
            if (! -f $fil) {
                if (defined $not_found{$fil}) {
                    $not_found{$fil}++;
                } else {
                    prt( "WARNING: File [$fil] NOT found ...\n" ) if ($show_not_found);
                    $not_found{$fil} = 1;
                }
            }
        }
    }
    prt( "Got $hrcnt HREF entries, from $bfil ... $filcnt link files\n" );
	return \@hrf;
}

sub parse_file2($$) {
    my ($bdir,$bfil) = @_;
	my $fil = $bdir.$bfil;
	my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt);
	my @hrf = ();
	if ( ! open INF, "<$fil") {
        prt( "WARNING: Can NOT open file [$fil]...\n" );
    	return @hrf;
    }
    my @lines = <INF>;
    close INF;
    $lncnt = scalar @lines;
    prt( "Processing $lncnt lines from [$fil] ...\n" );
    $full = join('',@lines);
    # sub write2file { 	my ($txt,$fil) = @_;
    #my $scrp = return_tag($full,'script');
    ##my $scrp = get_all_tag_text($full,'script');
    ##write2file($scrp,'tempscript.txt');
    ##prt( "Got script text [$scrp]\n" );
    @hrf = get_hrefs_from_string($full);
    $hrcnt = scalar @hrf;
    $filcnt = 0;
    for ($i = 0; $i < $hrcnt; $i++) {
        $fil = $bdir.$hrf[$i][0];
        $typ = $hrf[$i][1];
        if ($typ == HRT_FILE) {
            $filcnt++;
            if (! -f $fil) {
                if (defined $not_found{$fil}) {
                    $not_found{$fil}++;
                } else {
                    prt( "WARNING: File [$fil] NOT found ...\n" ) if ($show_not_found);
                    $not_found{$fil} = 1;
                }
            }
        }
    }
    prt( "Got $hrcnt HREF entries, from $bfil ... $filcnt link files\n" );
	return @hrf;
}

####################################
#########################################################
# Passed an array of extensions,
# check if this is one of them?
#########################################################
sub is_my_extension {
	my ($fil, @exts) = @_;
	my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
	foreach my $ex (@exts) {
		if (lc($ex) eq lc($ext)) {
			return 1;
		}
	}
	return 0;
}

############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################
sub is_htm_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, @html_extension) );
}
sub is_graphic_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, @graf_extension) );
}
sub is_zip_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, @zip_extension) );
}
sub is_css_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, @css_extension) );
}
sub is_txt_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, @txt_extension) );
}
sub is_script_extension {
	my ($fil) = shift;
	return( is_my_extension($fil, @script_extension) );
}

#use constant {
#	FT_UNKNOWN => 0,
##	FT_HTML => 1,
##	FT_GRAF => 2,
##	FT_CSS => 3,
##	FT_SCRIPT => 4,
##	FT_TEXT => 5,
##	FT_ZIP => 6,
##  FT_DIR => 7
#};
sub get_file_type_const {
	my ($fil) = shift;
	if (is_htm_extension($fil)) {
		return FT_HTML;
	} elsif (is_graphic_extension($fil)) {
		return FT_GRAF;
	} elsif (is_zip_extension($fil)) {
		return FT_ZIP;
	} elsif (is_css_extension($fil)) {
		return FT_CSS;
	} elsif (is_txt_extension($fil)) {
		return FT_TEXT;
	} elsif (is_script_extension($fil)) {
		return FT_SCRIPT;
	} elsif ($fil =~ /\/$/) {
        return FT_DIR;
    }
	return FT_UNKNOWN;
}

sub file_type_const_to_string {
	my ($ft) = shift;
	if ($ft == FT_HTML) {
		return "html";
	} elsif ($ft == FT_GRAF) {
		return "graphic";
	} elsif ($ft == FT_ZIP) {
		return "zip";
	} elsif ($ft == FT_CSS) {
		return "css";
	} elsif ($ft == FT_TEXT) {
		return "text";
	} elsif ($ft == FT_SCRIPT) {
		return "script";
	} elsif ($ft == FT_DIR) {
        return "directory";
	} elsif ($ft == FT_UNKNOWN) {
		return "unknown";
	}
	return "***FIX ME*** uncased type $ft!";
}

##############################################
####################################################################
# process_folder(folder) 
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays
####################################################################
sub process_folder {
	my ($inf) = shift;
	my ($ft, $ff, $nm, $dir, $ext, $val, $fil);
	my $fcnt = 0;
	prt( "Processing $inf folder ...\n" ) if ($dbg1);
	if ( opendir( DIR, $inf ) ) {
		my @files = readdir(DIR);
		closedir DIR;
		foreach $fil (@files) {
			if (($fil eq ".")||($fil eq "..")) {
				next;
			}
			$ft = get_file_type_const($fil);
			$ff = $inf;
			$ff .= "\\" if !($inf =~ /[\\\/]$/);
			$ff .= $fil;
			if ( -d $ff ) {
				if ($recurse) {
					if ($ignfpd && is_fp_folder($fil)) {	# ignore FRONTPAGE folders
						next;
					}
					if (@splexcludes && in_spl_excludes($fil)) {
						next;
					}
					process_folder( $ff );
				}
			} else {
				# NOTE: multidimensional arrays pushed - offsets into arrays
				if ( !in_excludes($fil) ) {  # NOT in @excludes
					($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
					$val = 0;
					$val = $ext_hash{$ext} if ( defined $ext_hash{$ext} );
					$val++;
					$ext_hash{$ext} = $val;
					push(@all_files, [$ff, '', '', 0, 0, '', '', $ft] );
					$fcnt++;
				}
			}
		}
		prt( "Processed $inf folder finding $fcnt files ...\n" );
	} else {
		prt( "ERROR: Failed to open folder $inf ...\n" );
	}
}

################################################
# my $ignfpd = 1;	# ignore FRONTPAGE folders
################################################
sub is_fp_folder {
	my ($inf) = shift;
	foreach my $fil (@fpfolders) {
		if (lc($inf) eq lc($fil)) {
			return 1;
		}
	}
	return 0;
}

####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes {
	my ($fil) = shift;
	my $lcf = lc($fil);
	foreach my $f (@excludes) {
		if (lc($f) eq $lcf) {
			return 1;
		}
	}
	return 0;
}

sub in_spl_excludes {
	my ($fldr) = shift;
	my $lfldr = lc($fldr);
	foreach my $f (@splexcludes) {
		if (lc($f) eq $lfldr) {
			return 1;
		}
	}

	return 0;
}

sub set_status_case {
	my ( $ch, $pch, $inccm, $inlnc, $inqot, $qot ) = @_;
	my $ldbg2 = 0;
	if ($$inccm) {
		if (($ch eq '/')&&($pch eq '*')) {
			$$inccm = 0;
			prt( "status: End C comment /* */ ...\n" ) if ($ldbg2);
		}
	} elsif ($$inlnc ) {
		if ($ch eq "\n") {
			$$inlnc = 0;
			prt( "status: End line comment // ...\n" ) if ($ldbg2);
		}
	} elsif ($$inqot ) {
		if ($ch eq $$qot) {
			prt( "status: End quote $$qot ...\n" ) if ($ldbg2);
			$$inqot = 0;
			$$qot = '';
		}
	} else {
		if ($ch eq '/') {
			if ($pch eq '/') {
				$$inlnc = 1;
				prt( "status: Entered line comment // ...\n" ) if ($ldbg2);
			}
		} elsif ($ch eq '*') {
			if ($pch eq '/') {
				$$inccm = 1;
				prt( "status: Entered C comment /* */ ...\n" ) if ($ldbg2);
			}
		} elsif (($ch eq '"')||($ch eq "'")) {
			$$qot = $ch;
			$$inqot = 1;
			prt( "status: Entered quote $$qot ...\n" ) if ($ldbg2);
		}
	}
}


sub get_all_tag_text {
	my ($txt, $tag) = @_;
	my $len = length($txt);
	my $ldbg1 = 0;
	my $ntxt = '';
	my $ch = '';
	my $pch = '';
	my $ftag = '';
	my $nline = '';
	my $i = 0;
	my $intag = 0;
	my $incomment = 0;
	my $inqot = 0;	# in quotes ' or "
	my $qot = '';
	my $inlnc = 0;	# in line comment
	my $inccm = 0;  # in C comment
	my ($part, $shlen);
	###prt("Processing $len chars for $tag ...\n");
	for ($i = 0; $i < $len; $i++) {
		$pch = $ch;
		$ch = substr($txt, $i, 1);
		set_status_case( $ch, $pch, \$inccm, \$inlnc, \$inqot, \$qot );
		if ($incomment) {
			$ntxt .= $ch;
			if ($ch eq '>') {
				$shlen = -15;
				if (length($ntxt) < 15) {
					$shlen = 0 - length($ntxt);
				}
				prt( "Potential close [".substr($ntxt,$shlen)."] ...($i)" ) if ($ldbg1);
				if (substr($ntxt,-3) eq '-->') {
					if (!$inqot && !$inlnc && !$inccm) {
						prt( " Yes\n" ) if ($ldbg1);
						$incomment = 0;	# no longer IN comment
						prt("End comment <!-- --> ...\n") if ($ldbg1);
					} else {
						if ($inqot) {
							prt( " NO DUE TO IN QUOTE\n" ) if ($ldbg1);
						} elsif ($inlnc) {
							prt( " NO DUE TO IN LINE COMMENT\n" ) if ($ldbg1);
						} elsif ($inccm) {
							prt( " NO DUE TO IN C COMMENT\n" ) if ($ldbg1);
						} else {
							prt( " NO DUE TO SOME REASON!!! **** CHECK ME!!! ****\n" ) if ($ldbg1);
						}
					}
				} else {
					prt( " NO!\n" ) if ($ldbg1);
				}
			}
		} elsif ($intag) {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$part = substr($txt,$i,4);
				if ($part eq '<!--') {	# if a powerful comment starts
					prt("Entering comment <!-- ...\n") if ($ldbg1);
					$incomment = 1;			# unconditionally go to end of this comment
					$ntxt .= $ch;
					next;
				}
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$pch = $ch;
					$ch = substr($txt, $i, 1);
					if ($ch eq '>') {
						last;
					}
					$ftag .= $ch;
				}
				$ntxt .= '<'.$ftag;
				###prt("Got tag [$ftag] ...\n");
				if (lc(substr($ftag,1)) eq lc($tag)) {
					$intag = 0;
				}
			} 
			$ntxt .= $ch;
		} else {
			if ($ch eq "<") {
				###prt("Got begin < ...\n");
				$i++;
				$ftag = '';
				for ( ; $i < $len; $i++ ) {
					$pch = $ch;
					$ch = substr($txt, $i, 1);
					if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) {
						last;
					}
					$ftag .= $ch;
				}
				###prt("Got tag [$ftag] ...\n");
				if (lc($ftag) eq lc($tag)) {
					$ntxt .= '<'.$ftag.$ch;
					if (($ch eq ' ')||($ch =~ /\s/)) {
						$i++;
						for ( ; $i < $len; $i++ ) {
							$pch = $ch;
							$ch = substr($txt, $i, 1);
							$ntxt .= $ch;
							if ($ch eq '>') {
								last;
							}
						}
					}
					###prt( "Entered tag <$ftag...> ($tag)...\n" );
					$intag = 1;
				}
			}
		}
	}
	return $ntxt;
}

sub unix_2_dos {
	my ($f) = shift;
	$f =~ s/\//\\/g;
	return $f;
}

#	OF_FF => 0,	# full file name
#	OF_HR => 1,	# array ref of href links
#	OF_IM => 2,	# array ref of image links
#	OF_LK => 3,	# linked count
#	OF_SP => 4,	# spare
#	OF_TO => 5,	# links TO
#	OF_FM => 6, # links FROM
#	OF_FT => 7	# file type
sub mark_in_all_files {
	my ($fil, $dir) = @_;
	my $max = scalar @all_files;
	my ($i, $ff, $nm, $dr, $fnd);
	$dir = unix_2_dos($dir);
	$fnd = 0;
	for ($i = 0; $i < $max; $i++) {
		$ff = $all_files[$i][OF_FF];
		($nm,$dr) = fileparse($ff);
		$dr = unix_2_dos($dr);
		if (($nm eq $fil)&&($dir eq $dr)) {
			$all_files[$i][OF_SP]++;
			$fnd++;
		}
	}
	if (!$fnd) {
		prt( "WARNING: [$fil] in [$dir] NOT found ...\n" );
	}
}

sub in_done_files {
	my ($nm, $dr, @dn) = @_;
	my $ct = scalar @dn;
	my ($nnm, $ndr);
	for (my $j = 0; $j < $ct; $j++) {
		$nnm = $dn[$j][0];
		$ndr = $dn[$j][1];
		if (($nnm eq $nm) && ($ndr eq $dr)) {
			return 1;
		}
	}
	return 0;
}


sub process_references {
	my ($bdir) = @_;
	my $rcnt = scalar @all_hrefs;
	my ($i, $ff, $ft, $name, $rdir, $hrt, @new_hrefs, @nxt_hrefs, $dncnt);
	$bdir = unix_2_dos($bdir);
	prt( "Processing $rcnt HREF found ...\n" );
	$dncnt = 0;
	for ($i = 0; $i < $rcnt; $i++) {
		$ff = $bdir.$all_hrefs[$i][0];
		$hrt = $all_hrefs[$i][1];
		$ft = $all_hrefs[$i][2];
		if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
			($name,$rdir) = fileparse( $ff );
			$rdir = unix_2_dos($rdir);
			if ($ff =~ /\.\./) {
				# this back up ...
				prt( "SKIPPING file [$name], in [$rdir] ...\n" );
			} else {
				prt( "File [$name], in [$rdir] ...\n" ) if ($dbg4);
				mark_in_all_files($name, $rdir);
				@new_hrefs = parse_file2($rdir, $name);
				$dncnt++;
			}
			push(@done_files, [$name, $rdir]);
		}
	}

	$rcnt = scalar @new_hrefs;
	prt( "Processed $dncnt file, for new $rcnt files ...\n" );
	$dncnt = 0;
	while ($rcnt) {
		@nxt_hrefs = ();
		$dncnt = 0;
		prt( "Processing $rcnt NEW HREF found ...\n" );
		for ($i = 0; $i < $rcnt; $i++) {
			$ff = $bdir.$new_hrefs[$i][0];
			$hrt = $new_hrefs[$i][1];
			$ft = $new_hrefs[$i][2];
			if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
				($name,$rdir) = fileparse( $ff );
				$rdir = unix_2_dos($rdir);
				if ( !in_done_files($name, $rdir, @done_files) ) {
					if ($ff =~ /\.\./) {
						# this back up ...
						prt( "SKIPPING file [$name], in [$rdir] ...\n" );
					} else {
						prt( "File [$name], in [$rdir] ...\n" );
						mark_in_all_files($name, $rdir);
						@nxt_hrefs = parse_file2($rdir, $name);
						$dncnt++;
					}
					push(@done_files, [$name, $rdir]);
				}
			}
		}
		@new_hrefs = @nxt_hrefs;
		$rcnt = scalar @new_hrefs;
		prt( "Processed $dncnt file, for new $rcnt files ...\n" );
	}
}

sub fits_special_filter($$) {
    my ($hr,$ra) = @_;
    # https://git.gitorious.org/flightgear-aircraft/717.git
    if ($hr =~ /^https:\/\/git\.gitorious\.org\/flightgear-aircraft\/(.+)\.git$/) {
        push(@{$ra},$1);
        return 1;
    }
    return 0;
}

sub show_href_list($) {
    my ($raorg) = @_;
    #          href  HRT-type FT-file
	#            0     1       2
	#push(@hrf, [$hr, $hrt,    $ft]);
    my $cnt = scalar @{$raorg};
    my @arr = sort @{$raorg};
    my $ra = \@arr;
    my %dupes = ();
    prt("Display of up to $cnt hrefs... eliminating duplicates\n");
    my ($i,$i2,$ccnt,$hr,$hrt,$ft,$min,$len,$shrt,$form,$splcnt);
    my @specials = ();
    $i = int($cnt / 10);
    $form = '%d';
    $i2 = 1;
    while ($i) {
        $i2++;
        $form = '%'.sprintf("%d",$i2).'d';
        $i = int($i / 10);
    }
    $min = 0;
    for ($i = 0; $i < $cnt; $i++) {
        $hr = ${$ra}[$i][0];
        $hrt = ${$ra}[$i][1];
        $ft = ${$ra}[$i][2];
        $len = length($hr);
        $min = $len if ($len > $min);
    }
    $i2 = 0;
    for ($i = 0; $i < $cnt; $i++) {
        $hr = ${$ra}[$i][0];
        $hrt = ${$ra}[$i][1];
        $ft = ${$ra}[$i][2];
        
        if (!defined $dupes{$hr}) {
            # display it
            $dupes{$hr} = 1;
            $splcnt++ if (fits_special_filter($hr,\@specials));
            $i2++;
            $ccnt = sprintf($form,$i2);
            $hr .= ' ' while (length($hr) < $min);
            $shrt = href_type_to_string($hrt);
            prt("$ccnt: $hr $shrt\n");
        }
    }
    prt("Displayed $i2 different hrefs...\n");
    if (@specials) {
        my @arr = sort @specials;
        write2file(join("\n",@arr)."\n",$splout);
        prt("Written $splcnt to 'special' list $splout\n");
    }
}

########################################################
### MAIN ###

parse_args(@ARGV);

($base_file,$base_dir) = fileparse( $in_file );
$base_dir = unix_2_dos($base_dir);

# process_folder($base_dir);	# get ALL the files, in mutidemensional array

my $ref_hrefs = parse_file($base_dir, $base_file);
show_href_list($ref_hrefs);

push(@done_files, [$base_file, $base_dir]);

# mark_in_all_files($base_file, $base_dir);
# $refcnt = scalar @all_hrefs;

# process_references($base_dir);

# show_results();

pgm_exit(0,"");

########################################################

sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load        (-l) = Load LOG at end.\n");
}
sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                $load_log = 1;
                prt("Set to load log at end\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n");
        }
        shift @av;
    }

    if ((length($in_file) ==  0) && $debug_on) {
        $in_file = $def_file;
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

# eof - gethrefs.pl
