#!/usr/bin/perl -w
# NAME: findextent.pl (was findfiles.pl)
# AIM: Search for files of a given extension in a directory, recursive by default
# 17/03/2013 - Rename more to what it does - find files with a specific EXTENSION
# 09/12/2011 geoff mclane http://geoffair.net/mperl
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use File::stat; # to get the file date (mtime) and size
use Digest::MD5  qw(md5 md5_hex md5_base64);
use Cwd;
# Directory holding the shared helper library; it is put at the front of
# @INC so the require below looks there first.
my $perl_dir = 'C:\GTools\perl';
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
# Script name with any leading directory part removed; both '\' and '/'
# are accepted as separators.
my $pgmname = $0;
$pgmname = (split(/(\\|\/)/,$pgmname))[-1] if ($pgmname =~ /(\\|\/)/);
# The log file lives in the helper-library directory and is named after
# this script; open_log() comes from lib_utils.pl (presumably - it is not
# defined in this file).
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.2 2013-03-17";
#my $VERS = "0.0.1 2011-11-03";
my $load_log = 0;   # set by -l (and forced on in debug mode); handed to close_log() at exit
my $in_dir = '';    # directory to scan (the bare command-line argument)
my $in_ext = '';    # extension to match, including the dot (-e)
my $verbosity = 0;  # bumped or set by -v; read by the VERBn() helpers
my $out_xml = '';   # value given to -o; stored by parse_args()

### program variables
my @warnings = ();      # messages collected by prtw(), listed by show_warnings()
my $cwd = cwd();
my $os = $^O;
my @file_list = ();     # one [name, full path, mtime, size, ext] record per file scanned
# Separator appended to directory names by process_dir(). Native Windows
# perl keeps the backslash this script has always used; every other
# platform gets a forward slash so the joined paths are valid there too.
my $dir_sep = ($os eq 'MSWin32') ? "\\" : "/";
my $tot_count = 0;      # running count of files recorded in @file_list
my @found_files = ();   # records of files whose extension matched; filled by process_files()
sub process_dir($$);    # forward declaration - process_dir() calls itself

# When $debug_on is true, parse_args() falls back to the defaults below for
# a missing directory/extension and always sets $load_log.
my $debug_on = 1;
my $def_dir = 'C:\FGCVS\Flightgear\data';
my $def_ext = ".wav";

# Verbosity gates: each one is true when the current $verbosity has reached
# the level in its name. The explicit 'return' is kept on purpose so these
# empty-prototype subs are always real calls that read the live value.
sub VERB1() {
    return $verbosity >= 1;
}

sub VERB2() {
    return $verbosity >= 2;
}

sub VERB5() {
    return $verbosity >= 5;
}

sub VERB9() {
    return $verbosity >= 9;
}

# List every warning collected in @warnings, or - at verbosity 9 and above -
# say that there were none.
#   $val : accepted for the caller's convenience; not used here.
sub show_warnings($) {
    my ($val) = @_;
    if (!@warnings) {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    } else {
        my $count = scalar @warnings;
        prt( "\nGot $count WARNINGS...\n" );
        foreach my $warning (@warnings) {
            prt("$warning\n");
        }
        prt("\n");
    }
}

# Common exit path: print an optional final message, list the collected
# warnings, close the log and terminate the process.
#   $val : process exit code
#   $msg : final message; nothing is printed when it is empty, and a
#          newline is appended when it does not already end with one
sub pgm_exit($$) {
    my ($exit_code,$text) = @_;
    if (length($text)) {
        $text .= "\n" unless ($text =~ /\n$/);
        prt($text);
    }
    show_warnings($exit_code);
    close_log($outfile,$load_log);
    exit($exit_code);
}


# Print a warning right away and remember it in @warnings so that
# show_warnings() can repeat it later. One trailing newline, if present,
# is stripped before the text is stored.
sub prtw($) {
    my ($text) = @_;
    $text =~ s/\n$//;
    prt("$text\n");
    push(@warnings,$text);
}

# Read a text file and print, with its line number, the remainder of every
# line that contains a '#include' directive.
#   $inf : path of the file to read
# Exits through pgm_exit(1, ...) when the file cannot be opened.
sub process_in_file($) {
    my ($inf) = @_;
    # Three-argument open with a lexical handle: the file name can no longer
    # be mistaken for an open mode, and no global INF handle is left behind.
    my $fh;
    if (! open($fh, '<', $inf)) {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    }
    my @lines = <$fh>;
    close($fh);
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my $lnn = 0;
    foreach my $line (@lines) {
        chomp $line;
        $lnn++;
        if ($line =~ /\s*#\s*include\s+(.+)$/) {
            my $inc = $1;
            prt("$lnn: $inc\n");
        }
    }
}

# Recursively scan a directory and append one record per plain file to the
# file-level @file_list:
#     [ name, full path, mtime, size, extension ]
# where the extension is the final ".xxx" suffix (including the dot) or an
# empty string when the name has none.
#   $dir : directory to scan
#   $lev : recursion depth; 0 for the top-level call, which also prints the
#          final file count
# Unreadable directories and files that cannot be stat'ed are reported
# through prtw() and skipped.
sub process_dir($$) {
    my ($dir,$lev) = @_;
    my @dirs = ();
    # A lexical handle keeps each recursion level independent of the others
    # instead of sharing one global DIR handle.
    if (opendir(my $dh, $dir)) {
        my @files = readdir($dh);
        closedir($dh);
        $dir .= $dir_sep if ( !($dir =~ /(\\|\/)$/) );
        foreach my $file (@files) {
            next if ($file eq "." || $file eq "..");
            my $ff = $dir.$file;
            if (-d $ff) {
                # Sub-directories are visited only after this one is finished.
                push(@dirs,$ff);
            } elsif (-f $ff) {
                if (my $sb = stat($ff)) {
                    # Only the suffix part of fileparse()'s result is needed.
                    my $ext = (fileparse($file, qr/\.[^.]*/))[2];
                    push(@file_list,[$file,$ff,$sb->mtime,$sb->size,$ext]);
                    $tot_count++;
                    if (($tot_count % 1000) == 0) {
                        prt("Got $tot_count files...\n");
                    }
                } else {
                    prtw("WARNING: Unable to 'stat' $ff\n");
                }
            }
        }
    } else {
        prtw("WARNING: Unable to open directory $dir\n");
    }
    foreach my $subdir (@dirs) {
        process_dir($subdir,$lev+1);
    }
    if ($lev == 0) {
       prt("Got $tot_count files...\n");
    }
}

# Entry point for the scan: confirm that the directory can be opened, then
# hand it to process_dir() at recursion level 0.
#   $dir : directory to scan
# Exits through pgm_exit(1, ...) when the directory cannot be opened.
sub process_in_dir($) {
    my ($dir) = shift;
    # Probe with a lexical handle so no global DIR handle is touched.
    if (opendir(my $dh, $dir)) {
        closedir($dh);
    } else {
        pgm_exit(1,"ERROR: Unable to open directory $dir!\n");
    }
    process_dir($dir,0);
}

# Pick out the scanned files whose extension equals $in_ext (ignoring case),
# compute an MD5 digest of each, group the files with identical digests and
# report every group together with the bytes that removing the extra copies
# would save.
#   $ra : reference to a list of [name, full path, mtime, size, ext] records
# Matching records are appended to the file-level @found_files as
#     0=name 1=full path 2=mtime 3=size 4=ext
#     5=md5 hex ('' when the file could not be read)
#     6=group id (0 = no duplicate found) 7=already-reported flag
# Files that cannot be opened produce a warning and are never treated as
# duplicates of anything.
sub process_files($) {
    my ($ra) = shift;   # = \@file_list
    my $cnt = scalar @{$ra};
    my $tot_siz = 0;
    prt("Found $cnt files to process... matching to [$in_ext]...\n");

    # Pass 1: keep only the records with the wanted extension.
    my $want_ext = lc($in_ext);
    foreach my $rec (@{$ra}) {
        my ($fil,$ff,$tm,$sz,$ext) = @{$rec};
        next if (lc($ext) ne $want_ext);
        $tot_siz += $sz;
        push(@found_files,[$fil,$ff,$tm,$sz,$ext,'',0,0]);
    }
    my $fcnt = scalar @found_files;
    prt("Found $fcnt with extent $in_ext... total ".get_nn($tot_siz)." bytes... getting MD5 of each...\n");

    # Pass 2: digest every file, and note the longest path for column layout.
    my $max = 75;       # widest path column the report will pad to
    my $width = 0;
    foreach my $rec (@found_files) {
        my $ff = $rec->[1];
        my $len = length($ff);
        $width = $len if ($len > $width);
        # Three-argument open: names with leading/trailing blanks or mode
        # characters are opened literally.
        if (open(my $fh, '<', $ff)) {
            binmode($fh);
            $rec->[5] = Digest::MD5->new->addfile($fh)->hexdigest;
            close($fh);
        } else {
            prtw("WARNING: open file [$ff] FAILED\n");
        }
    }

    # Pass 3: give every set of equal digests a common group id, which is
    # the 1-based index of the set's first member.
    prt("Comparing MD5 of each, with others...\n");
    my $mcnt = 0;
    for (my $i = 0; $i < $fcnt; $i++) {
        next if ($found_files[$i][6]);
        my $md5 = $found_files[$i][5];
        # An empty digest means the file was unreadable; such files must not
        # be matched against each other.
        if (length($md5)) {
            # Entries before $i need no check: had one of them matched, this
            # entry would already carry its group id.
            for (my $j = $i + 1; $j < $fcnt; $j++) {
                next if ($found_files[$j][6]);
                if ($md5 eq $found_files[$j][5]) {
                    $found_files[$i][6] = $i + 1;
                    $found_files[$j][6] = $i + 1;
                    $mcnt++;
                }
            }
        }
        if ($i && (($i % 100) == 0)) {
            prt("Done $i, with matches $mcnt...\n");
        }
    }
    prt("Found $mcnt of $fcnt with SAME MD5\n");

    # Pass 4: report each group once - first member, then its copies.
    my $scnt = 0;       # group number
    my $tot_dup = 0;    # bytes held by all the extra copies
    $width = $max if ($width > $max);
    for (my $i = 0; $i < $fcnt; $i++) {
        next if ($found_files[$i][7]);
        my $val = $found_files[$i][6];
        next if (!$val);
        $scnt++;
        my $tcnt = 1;
        my $entry = sprintf("%-*s %14s", $width, $found_files[$i][1], get_nn($found_files[$i][3]));
        prt("\n$scnt:$tcnt: $entry\n");
        $found_files[$i][7] = 1;
        my $dupes = 0;  # bytes held by the copies in this group
        # The first unreported member of a group is its lowest index, so the
        # remaining members all lie after $i.
        for (my $j = $i + 1; $j < $fcnt; $j++) {
            next if ($found_files[$j][7]);
            next if ($val != $found_files[$j][6]);
            my $sz = $found_files[$j][3];
            $dupes += $sz;
            $tcnt++;
            $entry = sprintf("%-*s %14s", $width, $found_files[$j][1], get_nn($sz));
            prt("$scnt:$tcnt: $entry\n");
            $found_files[$j][7] = 1;
        }
        $tot_dup += $dupes;
        prt("$tcnt dupes, save ".get_nn($dupes)." if eliminated.\n");
    }
    prt("Save ".get_nn($tot_dup)." bytes if all exact duplicates eliminated...\n");
}


#########################################
### MAIN ###
# Run order: read the command line into the option variables, scan the
# chosen directory tree into @file_list, filter that list by extension and
# report duplicates, then leave through pgm_exit() with exit code 0.
parse_args(@ARGV);
### prt( "$pgmname: in [$cwd]: Hello, World...\n" );
### process_in_file($in_dir);
process_in_dir($in_dir);
process_files(\@file_list);
pgm_exit(0,"");
########################################
# Print the usage summary through prt(). It does not exit; the caller
# decides what happens next.
sub give_help {
    prt("$pgmname: version $VERS\n");
    # The positional argument is a directory (parse_args() checks it with
    # -d), so the usage line names it as one.
    prt("Usage: $pgmname [options] in-directory\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --ext <ext>   (-e) = Extensions to search for. Must commence with dot (.)\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load        (-l) = Load LOG at end. ($outfile)\n");
    # NOTE(review): the -o value is stored in $out_xml, but nothing in this
    # file writes to it - confirm whether the option is still wanted.
    prt(" --out <file>  (-o) = Write output to this file.\n");
    prt("AIM: Search for files of a given extension in a directory, recursive by default.\n");

}

# Guard for switches that take a value: called with the remaining argument
# list, whose first element is the switch itself. Exits through
# pgm_exit(1, ...) when nothing follows the switch.
sub need_arg {
    my $switch = shift;
    my @remaining = @_;
    pgm_exit(1,"ERROR: [$switch] must have a following argument!\n") if (scalar(@remaining) == 0);
}

# Parse the command line into the file-level option variables.
#
# Switches may carry any number of leading dashes and are recognised by
# their first letter:
#   h or ?   - print the help and exit 0
#   v        - verbosity: a trailing number sets the level (-v5, --verb10),
#              otherwise every leading 'v' adds one (-vvv)
#   l        - load the log file at exit
#   e <ext>  - extension to search for
#   o <file> - output file name
# A bare argument is taken as the input directory; the last one given wins.
#
# Exits through pgm_exit(1, ...) on an unknown switch, on a switch that is
# missing its value, when no directory or extension is known after parsing,
# or when the directory does not exist.
sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            # Strip every leading dash, so -e, --e and ---e are the same.
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                # \D* rather than .* : a greedy .* would swallow all but the
                # last digit, turning -v10 into level 0.
                if ($sarg =~ /^v\D*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                $load_log = 1;
                prt("Set to load log at end.\n") if (VERB1());
            } elsif ($sarg =~ /^e/) {
                need_arg(@av);
                shift @av;
                $in_ext = $av[0];
                prt("Set extension to [$in_ext].\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $out_xml = $av[0];
                prt("Set out file to [$out_xml].\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_dir = $arg;
            prt("Set input to [$in_dir]\n");
        }
        shift @av;
    }

    # Debug mode: fill in whatever was not given from the built-in defaults
    # and always load the log at the end.
    if ($debug_on) {
        prtw("WARNING: DEBUG is ON!\n");
        if (length($in_dir) ==  0) {
            $in_dir = $def_dir;
            prt("Set DEFAULT directory [$in_dir]\n");
        }
        if (length($in_ext) == 0) {
            $in_ext = $def_ext;
            prt("Set DEFAULT extent [$in_ext]\n");
        }
        $load_log = 1;
    }
    if (length($in_dir) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (length($in_ext) ==  0) {
        pgm_exit(1,"ERROR: No extension to search for found in command!\n");
    }
    if (! -d $in_dir) {
        pgm_exit(1,"ERROR: Unable to find in directory [$in_dir]! Check name, location...\n");
    }
}

# eof - findextent.pl
