#!/perl -w
# NAME: gensiteindex.pl
# AIM: Given a root file, or folder, scan ALL direcotries, build and output
# a site index.
# 2010/04/13  - geoff mclane - http://geoffair.net/mperl/
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;
unshift(@INC, 'C:\GTools\perl');
require 'logfile.pl' or die "Unable to load logfile.pl...\n";
require 'htmllib.pl' or die "Unable to load htmllib.pl...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
    my @tmpsp = split(/\\/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $perl_dir = 'C:\GTools\perl';
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $load_log = 1;
my $in_file = 'C:\HOMEPAGE\FG\index.html';
my $show_title_list = 0;
my $show_title = 0; # display title, during accumulation
my $out_html = $perl_dir."\\tempout.htm";

my @fpfolders = qw( aspnet_client _vti_cnf _vti_pvt _private _derived );
my @html_ext = qw( .htm .html .shtml .php );
my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_ext  = qw( .css );
my @script_ext = qw( .js .class .cgi );

my @def_indexes = qw( default.html default.htm  index.aspx
   index.htm index.html index.jsp index.php index.shtml  home.htm
   home.html welcome.html welcome.htm );

### program variables
my @warnings = ();
my $cwd = cwd();
my $os = $^O;

my @g_xclude_dirs = ();
my @g_xclude_files = ();
my $g_indir = '';
my $g_infile = '';   # none yet
my $g_dir_count = 0;
my $g_item_count = 0;

# global file array
# constants
my $OF_FFN = 0; # full file name
my $OF_EXT = 1; # extension type number like $FE_HTM, $FE_IMG, etc
my $OF_IND = 2; # if it contforms to an 'index' type file
my $OF_LEV = 3; # directory depth 0=root, 1, 2, 3...
my $OF_CTA = 4; # content array (ref), if HTML type
my $OF_TIT = 5; # title text (if any)
my $OF_LNK = 6; # array reference of links
#sub add_2_g_file_array($$$$) {
#    my ($ff,$idi,$ext,$lev) = @_;
#                    0   1    2    3  4
# $OF_FFN            0 = full file name
# $OF_EXT                1 = extent type number $FE_HTM, etc
# $OF_IND                     2 = is an 'index' type file
# $OF_LEV                          3 = depth of directory
# $OF_CTA                               4 = contents array, if $FE_HTM
# $OF_TIT                                  5 = title (or sfn, if none, of HTM files
# $OF_LNK                                     6 = links for HTM files
#push(@g_file_array,[$ff,$ext,$idi,$lev,[],'',[],0]);
my @g_file_array = ();

### constants
my $FE_OTH = 0;
my $FE_HTM = 1;
my $FE_IMG = 2;
my $FE_ZIP = 3;
my $FE_CSS = 4;
my $FE_TXT = 5;
my $FE_SCR = 6;

### debug
my $dbg01 = 0;  # show each directory parse

### forward refs
sub process_sub_dir($$);

sub show_warnings($) {
    my ($val) = shift;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
            prt("$itm\n");
        }
        prt("\n");
    } elsif ($val) {
        prt( "\nNo warnings issued.\n\n" );
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}


sub os_is_win() { return (($^O eq 'MSWin32') ? 1 : 0); }

sub dos_2_unix($) {
    my ($du) = shift;
    $du =~ s/\\/\//g;
    return $du;
}

sub unix_2_dos($) {
    my ($du) = shift;
    $du =~ s/\//\\/g;
    return $du;
}

sub is_fp_folder($) {
    my ($inf) = shift;
    $inf = lc($inf) if (os_is_win());
    foreach my $fil (@fpfolders) {
        $fil = lc($fil)  if (os_is_win());
        return 1 if ($inf eq $fil);
    }
    return 0;
}

sub is_xclude_folder($) {
    my ($inf) = shift;
    $inf = lc($inf) if (os_is_win());
    foreach my $fil (@g_xclude_dirs) {
        $fil = lc($fil)  if (os_is_win());
        return 1 if ($inf eq $fil);
    }
    return 0;
}

sub is_excluded_dir($) {
    my ($dir) = shift;
    return 1 if (is_fp_folder($dir));
    return 1 if (is_xclude_folder($dir));
    return 0;
}

sub is_user_excluded($) {
    my ($file) = shift;
    $file = lc($file) if (os_is_win());
    foreach my $fil (@g_xclude_files) {
        $fil = lc($fil)  if (os_is_win());
        return 1 if ($file eq $fil);
    }
    return 0;
}    

#########################################################
# Passed an array REF of extensions,
# check if this is one of them?
#########################################################
sub is_this_extent($$) {
   my ($ext, $rex) = @_;
   my $lcx = lc($ext);
   foreach my $x (@{$rex}) {
      return 1 if ($lcx eq lc($x));
   }
   return 0;
}

############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################

# test an EXTENSION, or form '.htm'...
sub is_htm_ext($) {
   my ($ext) = shift;
   return( is_this_extent($ext,\@html_ext) );
}
sub is_graf_ext($) {
   my ($ext) = shift;
   return( is_this_extent($ext,\@graf_ext) );
}
sub is_zip_ext($) {
   my ($ext) = shift;
    my @arr = qw( .zip .gz );
   return( is_this_extent($ext,\@arr) );
}
sub is_css_ext($) {
    my ($ext) = shift;
    return( is_this_extent($ext, \@css_ext) );
}
sub is_txt_ext($) {
    my ($ext) = shift;
    my @arr = qw( .txt );
    return( is_this_extent($ext, \@arr) );
}
sub is_script_ext($) {
    my ($fil) = shift;
    return( is_this_extent($fil, \@script_ext) );
}

# test a FILE/PATH extension
sub is_htm_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_htm_ext($e) );
}
sub is_graphic_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_graf_ext($e) );
}
sub is_zip_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_zip_ext($e) );
}
sub is_css_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_css_ext($e) );
}
sub is_txt_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_txt_ext($e) );
}
sub is_script_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_script_ext($e) );
}

sub get_file_ext_type($) {
    my ($fil) = shift;
    return $FE_HTM if (is_htm_file_ext($fil));
    return $FE_IMG if (is_graphic_file_ext($fil));
    return $FE_ZIP if (is_zip_file_ext($fil));
    return $FE_CSS if (is_css_file_ext($fil));
    return $FE_TXT if (is_txt_file_ext($fil));
    return $FE_SCR if (is_script_file_ext($fil));
    return $FE_OTH;
}

sub type_2_stg($) {
    my ($t) = shift;
    return "HTML" if ($t == $FE_HTM);
    return "IMG" if ($t == $FE_IMG);
    return "ZIP" if ($t == $FE_ZIP);
    return "CSS" if ($t == $FE_CSS);
    return "TXT" if ($t == $FE_TXT);
    return "SCRIPT" if ($t == $FE_SCR);
    return "Other";
}

sub add_2_g_file_array($$$$) {
    my ($ff,$idi,$ext,$lev) = @_;
    #                    0   1    2    3  4
    #                    0 = full file name
    #                        1 = extent type number $FE_HTM, etc
    #                             2 = is an 'index' type file
    #                                  3 = depth of directory
    #                                       4 = contents array, if $FE_HTM
    #                                           5 = title (or sfn, if not found), of HTM files
    #                                              6 = links - for HTML files
    push(@g_file_array, [$ff,$ext,$idi,$lev,[],,'',[],0]);
}

sub get_g_file_array_counts() {
    my $len = scalar @g_file_array;
    my %hash = ();
    my ($typ,$i);
    for ($i = 0; $i < $len; $i++) {
        $typ = $g_file_array[$i][1];
        $hash{$typ}++;
    }
    my $res = "Total=$len";
    $len = 0;
    foreach $typ (keys %hash) {
        $i = $hash{$typ};
        $res .= " ".type_2_stg($typ)."=$i";
        $len += $i;
    }
    $res .= " ($len)";
    return $res;
}

# 2010.03.16 - seek an 'index.htm' type file
sub is_def_index_file($) {
   my ($fil) = @_;
   $fil = lc($fil) if (os_is_win());
   foreach my $f (@def_indexes) {
       $f = lc($f) if (os_is_win());
       return 1 if ($f eq $fil);
   }
   return 0;
}

sub process_sub_dir($$) {
    my ($indir,$lev) = @_;
    $indir = dos_2_unix($indir);    # use ALL unix form of path
    pgm_exit(1,"ERROR: Unable to open directory [$indir]!\n") if ( !opendir( DIR, $indir ) );
    my @files = readdir(DIR);
    closedir DIR;
    my ($fcnt,$file,$ff,@dirs,$idi,$ext);
    $fcnt = scalar @files;
    prt("Processing $fcnt files, from [$indir]...\n") if ($dbg01);
    $indir .= '/' if !($indir =~ /(\\|\/)$/);
    @dirs = ();
    foreach $file (@files) {
        next if (($file eq '.')||($file eq '..'));
        $g_item_count++;
        $ff = $indir.$file;
        if (-f $ff) {
            $idi = is_def_index_file($file);
            $ext = get_file_ext_type($file);
            add_2_g_file_array($ff,$idi,$ext,$lev) if (!is_user_excluded($file));
        } elsif (-d $ff) {
            push(@dirs,$ff) if (!is_excluded_dir($file));
            $g_dir_count++;
        } else {
            pgm_exit(1,"ERROR: WHAT IS THIS? [$ff]!!\n");
        }
    }
    foreach $ff (@dirs) {
        process_sub_dir($ff,($lev + 1));
    }
}

sub get_all_files($) {
    my ($inf) = @_;
    $inf = dos_2_unix($inf);    # use ALL unix form of path
    my ($infile,$indir);
    if (-f $inf) {
        ($infile,$indir) = fileparse($inf);
        #prt("Got file [$infile], in directory [$indir]...\n");
        $indir = $cwd if ($indir =~ /^\.(\\|\/)$/);
    } elsif (-d $inf) {
        $indir = $inf;
        $infile = '';   # none yet
    }
    $indir .= '/' if !($indir =~ /(\\|\/)$/);
    pgm_exit(1,"ERROR: Unable to open directory [$indir]!\n") if ( !opendir( DIR, $indir ) );
    my @files = readdir(DIR);
    closedir DIR;
    my ($fcnt,$file,$ff,@dirs,$idi,$ext);
    $fcnt = scalar @files;
    prt("Processing $fcnt files, from [$indir]...\n") if ($dbg01);
    @dirs = ();
    my $lev = 0;
    foreach $file (@files) {
        next if (($file eq '.')||($file eq '..'));
        $g_item_count++;
        $ff = $indir.$file;
        if (-f $ff) {
            $idi = is_def_index_file($file);
            $ext = get_file_ext_type($file);
            add_2_g_file_array($ff,$idi,$ext,$lev) if (!is_user_excluded($file));
        } elsif (-d $ff) {
            push(@dirs,$ff) if (!is_excluded_dir($file));
            $g_dir_count++;
        } else {
            pgm_exit(1,"ERROR: WHAT IS THIS? [$ff]!!\n");
        }
    }

    # now process the subdirectories, and sub-sub...
    foreach $ff (@dirs) {
        process_sub_dir($ff,($lev + 1));
    }
}

sub get_html_title($$) {
    my ($rca,$sfn) = @_;
    my $cnt = scalar @{$rca};
    my $txt = '';
    my ($i,$tag);
    for ($i = 0; $i < $cnt; $i++) {
        $tag = ${$rca}[$i][1];
        if ($tag eq 'title') {
            $txt = trim_all(${$rca}[$i+1][0]) if (($i + 1) < $cnt);
            last;
        }
    }
    $txt = $sfn if (length($txt) == 0);
    return $txt;
}

sub get_html_links_array($) {
    my ($rca) = @_;
    my $cnt = scalar @{$rca};   # get count of items
    my ($i,$txt,$tag,$rah);
    my ($src,$typ);
    my @arr = ();
    for ($i = 0; $i < $cnt; $i++) {
        $txt = ${$rca}[$i][0];
        $tag = ${$rca}[$i][1];
        # get the attribute HASH REFERENCE
        $rah = ${$rca}[$i][3];
        if (defined ${$rah}{'src'}) {
            $src = ${$rah}{'src'};
            $typ = get_href_type($src);
            push(@arr, [$src,$typ,$tag,'src']);
        }
        if (defined ${$rah}{'href'}) {
            $src = ${$rah}{'href'};
            $typ = get_href_type($src);
            push(@arr, [$src,$typ,$tag,'href']);
        }
    }
    return \@arr;
}

#my $OF_FFN = 0; # full file name
#my $OF_EXT = 1; # extension type number like $FE_HTM, $FE_IMG, etc
#my $OF_IND = 2; # if it contforms to an 'index' type file
#my $OF_LEV = 3; # level/depth of sub-directory 0= root
#my $OF_CTA = 4; # content array (ref), if HTML type
#my $OF_TIT = 5; # title text (if any)
#my $OF_LNK = 6; # links (for HTM files)
sub process_html_files($) {
    my ($ra) = shift;   # reference to @g_file_array
    my $max = scalar @{$ra};
    my ($i,$typ,$ff,$hr,$cnt,$hcnt,$sff,$rca,$tit,$lev);
    my ($id,$rlnk,$lcnt);
    $cnt = 0;
    $hcnt = 0;
    for ($i = 0; $i < $max; $i++) {
        $typ = ${$ra}[$i][$OF_EXT];
        $hcnt++ if ($typ == $FE_HTM);
    }
    prt("Found $hcnt HTML files, of total $max, to process from dir [$g_indir]...\n");
    for ($i = 0; $i < $max; $i++) {
        $typ = ${$ra}[$i][$OF_EXT];
        $lev = ${$ra}[$i][$OF_LEV];
        if ($typ == $FE_HTM) {
            $ff = ${$ra}[$i][$OF_FFN];
            $id = ${$ra}[$i][$OF_IND];  # is INDEX type
            $sff = sub_common_folder_unix($ff,$g_indir);
            $sff = unix_2_dos($sff) if (os_is_win());
            $cnt++;
            # $hr = get_html_file_hash($ff,0,4);  # 4 = show dropped closing elements warnings
            $hr = get_html_file_hash($ff,0,0);
            if (defined ${$hr}{$ff}) {
                $rca = ${$hr}{$ff};
                $tit = get_html_title($rca,$sff);
                $rlnk = get_html_links_array($rca);
                $lcnt = scalar @{$rlnk};
                ${$ra}[$i][$OF_CTA] = $rca; # store the file contents array
                ${$ra}[$i][$OF_TIT] = $tit; # add a 'title' (or short file name, if no title, or blank
                ${$ra}[$i][$OF_LNK] = $rlnk; # add links array (ref)
            } else {
                pgm_exit(1,"ERROR: What gives??? File name [$ff] NOT defined\n");
            }
            prt("$cnt of $hcnt: Done [$sff]$lev $lcnt links ");
            prt("title=[$tit] ") if ($show_title);
            prt("INDEX") if ($id);
            prt("\n");
        }
    }
}

sub mycmp_decend_stgi0 {
	return -1 if (lc(${$a}[0]) lt lc(${$b}[0]));
	return  1 if (lc(${$a}[0]) gt lc(${$b}[0]));
	return 0;
}

sub get_html_head() {
    my $head = <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
      lang="en"
      xml:lang="en">
 <head>
  <meta http-equiv="Content-Type"
        content="text/html; charset=us-ascii" />
  <meta name="Keywords"
        content=
        "free,gpl,flightgear,simgear,terragear,flight,simulator,simulation,world,scenery" />
<title>Site Index - FlightGear</title>
  <style type="text/css">
/*<![CDATA[ page specific style */
  /*]]>*/
  </style>
</head>
<body>
<h1 align="center">Site Map</h1>
EOF
    return $head;
}

sub build_site_index($) {
    my ($ra) = shift;   # reference to @g_file_array
    my $max = scalar @{$ra};
    my ($i,$typ,$lev,$ff,$id,$sff,$rca,$tit,$lnk);
    my ($cnt,$j,$src,$tag,$styp);
    my $html = '<ul>';
    for ($i = 0; $i < $max; $i++) {
        $typ = ${$ra}[$i][$OF_EXT];
        $lev = ${$ra}[$i][$OF_LEV];
        $ff = ${$ra}[$i][$OF_FFN];
        if ($typ == $FE_HTM) {
            $id = ${$ra}[$i][$OF_IND];  # is INDEX type
            $sff = sub_common_folder_unix($ff,$g_indir);
            $sff = dos_2_unix($sff);
            $rca = ${$ra}[$i][$OF_CTA]; # get the file contents array
            $tit = ${$ra}[$i][$OF_TIT]; # get a 'title' (or short file name, if no title, or blank
            $lnk = ${$ra}[$i][$OF_LNK]; # get links array (ref)
            $cnt = scalar @{$lnk};
            # push(@arr, [$src,$typ,$tag,'href']);
            $html .= "<li><a href=\"$sff\">$tit</a><br>\n";
#            if ($cnt) {
#                for ($j = 0; $j < $cnt; $j++) {
#                    $src = ${$lnk}[$j][0];
#                    $styp = ${$lnk}[$j][1];
#                    $tag = ${$lnk}[$j][2];
#                    if (($tag ne 'script')&&($tag ne 'link' )) {
#                        $html .= "<a href=\"$src\">$src ($styp-$tag)</a>\n";
#                    }
#                }
#            }
            $html .= "</li>\n"
        }
    }
    $html .= "</ul>\n";
    $html = get_html_head().$html;
    $html .= "</body>\n";
    $html .= "</html>\n";
    write2file($html,$out_html);
    prt("HTML written to $out_html...\n");
    system($out_html);
}

#########################################
### MAIN ###
parse_args(@ARGV);
prt( "$pgmname: in [$cwd]: In file [$in_file]...\n" );
if (-f $in_file) {
    ($g_infile,$g_indir) = fileparse($in_file);
    #prt("Got file [$infile], in directory [$indir]...\n");
    $g_indir = $cwd if ($g_indir =~ /^\.(\\|\/)$/);
} elsif (-d $in_file) {
    $g_indir = $in_file;
    $g_infile = '';   # none yet
} else {
    pgm_exit(1,"ERROR: In file [$in_file] NOT valid!\n");
}
get_all_files($in_file);
prt("Processed $g_item_count items, $g_dir_count directories, for -\n");
prt( get_g_file_array_counts()."\n" );
process_html_files(\@g_file_array);
# now have has reference of ALL files
build_site_index(\@g_file_array);

pgm_exit(0,"Normal exit(0)");
########################################
sub parse_args {
    my (@av) = @_;
    while (@av) {
        $in_file = $av[0];
        shift @av;
    }
}

# eof - gensiteindex.pl
