#!/perl -w
# NAME: wordindex01.pl
# AIM: Given a FILE, load it in WORD, extract the text, and build an alphabetic
# index of words ...
# Uses Word OLE engine
# see http://www.ngbdigital.com/perl_ole_word.html
# 21/08/2007 geoff mclane - http://geoffair.net/mperl
#
use strict;
use warnings;
use Win32::OLE;
use Win32::OLE qw(in with);
use Win32::OLE::Variant;
use Win32::OLE::Const 'Microsoft Word'; 
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp.'.$0.'.txt';
if ($0 =~ /\w{1}:\\.*/) {
	my @tmpsp = split(/\\/,$0);
	$outfile = 'temp.'.($tmpsp[-1]).'.txt';
}
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
my $in_file = 'C:\Documents and Settings\Geoff McLane\My Documents\Tidy\Php-01.doc';

my @common = qw( am as be br but by can do eof etc for from get got has
have hi if in it its may my no not now of or re see so some an on such 
sure at to too us is was with you );
sub in_common {
	my ($tx) = shift;
	foreach my $t (@common) {
		if ($t eq $tx) {
			return 1;
		}
	}
	return 0;
}

# debug
my $dbg1 = 0;	# show stored value
my $dbg2 = 0;	# show REPEATED words
my $dbg3 = 0;	# show progress each 100 words
my $dbg9 = 1;	# show actions sent to prtv9 ...

my %distinct = (); # TO HOLD THE FINAL LIST

enumerate_doc( $in_file );

my $wcnt = keys( %distinct );
prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" );
my $cnt = 0;
foreach my $wort (sort keys %distinct){
	$cnt++;
	if ($cnt < 10) {
		prt("  $cnt ");
	} elsif ($cnt < 100) {
		prt(" $cnt ");
	} else {
		prt("$cnt ");
	}
	prt( "[$wort] $distinct{$wort}\n" ); 
} 
prt( "Done $cnt output of sorted keys, with count ...\n" );

close_log($outfile,1);
exit(0);
#####################################
###### subs

sub Quit {
	my( $Obj ) = @_;
	$Obj->Quit();
}

sub enumerate_doc {
	my ($infile) = shift;
	my $wdcnt = 0;
	my $lcword = '';
	my $newcnt = 0;
	# a stepped approach to openning, or loading Microsoft Word
	prt( "Attaching to Word application ...\n" );
	my $Word = Win32::OLE->GetActiveObject('Word.Application');
	if ($Word) {
		prt( "Using existing running Word application ...\n" );
	} else {
		prt( "Starting NEW Word application ...\n" );
		$Word = Win32::OLE->new('Word.Application', 'Quit'); 
		if ($Word) {
			prt("New application running ...\n");
		} else {
			mydie( "ERROR: Failed to load Word application ...\n" );
		}
	}
	$Word->{'Visible'}     = 0;
	$Word->{DisplayAlerts} = 0;

	# Load the application with the document
	prt( "Openning document $infile ...\n" );
	my $Doc = $Word->Documents->Open($infile) 
	|| mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); 

	prt( "Getting contents of the ActiveDocument ... wait ...\n" );
	my $myRange = $Word->ActiveDocument->Content; 

	# Collections - Characters Words Sentences Paragraphs Sections HeadersFooters 
	foreach my $word (in $myRange->Words){ 
		$wdcnt++;
		if (($wdcnt % 100) == 0) {
			prt( "Processed $wdcnt words ...\n" ) if ($dbg3);
		}
		$lcword = lc($word->{Text});	# extract the 'word', in lowercase
		# try to trim it up a bit
		chomp $lcword; # remove trailing \n char, if any ...
		$lcword = replace_hibits($lcword);	# have seen 0xA0 in string - replace with SPACE
		$lcword = remove_quotes($lcword);	# remove any QUOTES, " or ' at begin/end
		$lcword = trim_all($lcword);		# trim it up
		$lcword = remove_quotes($lcword);	# remove any onner quotes
		$lcword = trim_all($lcword);		# and trim AGAIN
		###next if not $lcword =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha 
		next if ( !($lcword =~ /^\w{2}/) );	# forget it if not start with 2 alphanumeric
		if ($lcword =~ /^\d+$/) {
			next if (length($lcword) < 4);	# dump numbers less than length 4
		}
		next if (in_common($lcword));		# exclude a bumch of 'common' words
		if (length($lcword) > 3) {			# tried to exclude plurals, but mainly FAILED
			if (substr($lcword,length($lcword)-1) eq 's') {
				my $tmp = substr($lcword,0,length($lcword)-1);
				next if (defined $distinct{$tmp} );
			}
		}
		# keep count of words collected
		if (defined $distinct{$lcword} ) {
			$distinct{$lcword}++;
			prtv9( "[$lcword] bumped count to $distinct{$lcword}...\n" ) if ($dbg2);
		} else {
			prtv9( "[$lcword] stored ...\n" ) if ($dbg1);
			$distinct{$lcword} = 1;
			$newcnt++;
		}
	} 
	prt( "Processed $wdcnt words ... collected $newcnt ...\n" );
}

sub prtv9 {
	my ($txt) = shift;
	prt( "$txt" ) if ($dbg9);
}

sub remove_quotes {
	my ($tx) = shift;
	$tx =~ s/^'//; # remove any beginning single quotes
	$tx =~ s/^"//g; # remove any beginning double quotes
	$tx =~ s/'$//; # remove any ending single quotes
	$tx =~ s/"$//g; # remove any ending double quotes
	return $tx;
}

sub replace_hibits {
	my ($tx) = shift;
	my $mx = length($tx);
	my $ntx = '';
	my ($ch, $val);
	for (my $i = 0; $i < $mx; $i++) {
		$ch = substr($tx,$i,1);
		$val = ord($ch);
		if ($val > 127) {
			$ch = ' ';
		}
		$ntx .= $ch;
	}
	return $ntx;
}

# eof - wordindex01.htm
