#!/Perl
# Series: Microsoft Word Automation
# original from : http://www.stouk.com/documents/perl/gui/guiref/page04.htm
# purpose: To load a Microsoft Word document, get the contents of the document,
# and put all the words into a HASH ... sort and show the HASH ...
# NOTE: If a NEW application is loaded, or the document is LARGE, the
# script can appear stalled. Increase $verb to 9 to see more action
# happening ... sometimes 30-50 seconds to process even 1 page ... I assume
# something to do with getting the OLE server running ...
# Setting word visible seems to INCREASE the time taken?
# It seems to run FASTEST when a copy of Word is already open ...
# author: geoff mclane - email: geoffair@hotmail.com - 2006-06-12
#
# NOTE WELL: ON LARGE FILES, THIS CAN TAKE A VERY LONG TIME, UNLESS
# YOU ALREADY HAVE A COPY OF WORD RUNNING. WHEN THERE IS A COPY OF
# WORD RUNNING, THE RESULTS CAN BE QUITE QUICK!!!!!
# IN FACT IT MAY BE COMPLETELY STALLING IF AN INSTANCE OF WORD IS NOT
# ALREADY RUNNING - THIS INSTANCE ALSO GETS CLOSED AT THE END ...
# IT CAN TAKE A MINUTE OR TWO - A LONG TIME TO WAIT ;=))
# like Done ... in 80.1525819301605 seconds ...
# WITH FURTHER TESTS, IT DOES APPEAR TO STALL IF NO OTHER
# WORD INSTANCE RUNNING, AND THE APPLICATION IS LEFT HIDDEN?
# HARD TO WORK OUT ...
#
# MORE TRIES - it appears -
# 	prt( "Getting object for $infile ...\n" );
#	$WordObj = Win32::OLE->GetObject($infile);
#   works MUCH FASTER ... 
#   Done ... in 7.13679909706116 seconds ...
# WITH OR WITHOUT ANOTHER INSTANCE RUNNING!!!
#
use strict; 
use Win32::OLE qw(in with); 
use Win32::OLE::Const 'Microsoft Word'; 
use Time::HiRes qw(gettimeofday); # provide more accurate timings

# *** ALTER THIS TO POINT TO YOUR OWN TEST DOCUMENT ***
#######################################################
###my $infile = 'c:\tmp\test.doc';
my $infile = 'c:\tmp\test2.doc';
# #####################################################
my @exclwordlist = qw(
aborting accept action active addition all 
already alter an and any anything appear appears append 
application apply approach archive are as associated assume at 
attaching author background base be before beginning better bit  
breaks build bumped but can caption char characters 
checkspelling close closing cmd cnt code collate collections com command 
commented considerable const content contents continue 
control copies copy could count created dead default defined deleting die 
different distinct do doc docs documents 
does doing done dos double each else elsif email end ending 
enumerations eof error even examples existing 
exit extensible extracts failed faq fastest file fileformat filename 
files filesearch filtered final first following for forget format 
formats found from full function geoff geoffair get getting 
going got guiref had happening have headersfooters help hence hold 
hotmail htm html http hv05213080 if in increase index infile information inhalt 
initial input into is it item items its june just kcnt keep key keys language 
large lasterror lc left library library' like line list load loaded 
loading locate log lot lots mailto maybe mclane me 
microsoft ml mocrosfot modified monday more move ms msdn msg my 
mydie myrange name need needed nerror new next nmay no not note notepad notes 
numbers objects of on one online only open openning options or 
original other ouput out outfile output outputfilename own page page04 pages 
pagesetup pagetype paragraphs parent part passing path per perl pl point 
presently preserved previous print printer printout prints  
printzoomcolumn printzoompaperheight printzoompaperwidth printzoomrow 
processing properly properties property prt prtv9 purpose put quick quit quite 
quitting quotes qw readonly remind remove represents resize result review 
reviewed rich rtf run running saveas script seconds sections see 
seems selection sendfax sentences series server setting shift should show 
showing simply single slow so som some something sometimes sort sorted spaces 
specified stalled standard start starting stepped stored stouk strict stuff sub 
sure system taken template test tested text that the then there these things 
this time tmp to trailing tried tries trim trouble try trying txt 
unable up url us use used username using usually various 
vbawd11 verb version versions wait want warning was wcnt wd wdalertsnone 
wdformatdostext wdformatencodedtext 
wdformatfilteredhtml wdformathtml wdformatrtf wdformattemplate wdformattext 
wdformattextlinebreaks wdformatunicodetext wdformatwebarchive wdformatxml 
wdgreen wdopenformatauto wdprintalldocument  
wdprintdocumentcontent we web what what's whatever when while whole will win32 
window windows winfaq12 with wlog womthsaveas1 word words words' work worked 
wort wrconwordobjectmodeloverview wrcore wrgrfapplicationobject write www xml 
yeek you your yours
);

my $wid = 0;
my $inhalt = '';
my %distinct = (); # TO HOLD THE FINAL LIST
# just for LOG FILE ouput ...
my ($LOG);
my $write_log = 0;
my $verb = 1; # increase to 9 to see more output
my $outfile = "temp.$0.txt"; # note name of perl file used as base
if ( open( $LOG, ">$outfile" ) ) {
    $write_log = 1; # we have a LOG file
} else {
    $write_log = 0;
    prt( "WARNING: Unable to open $outfile LOG ...\n" );
}
my $bt = gettimeofday;

my $WordApp = undef;
my $WordObj = undef;
### Various ways to do this ...
# Collections - Characters Words Sentences Paragraphs Sections HeadersFooters
use_get_object();
### OR
##use_word_app(0);

my $wcnt = keys( %distinct );
prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" );
my $cnt = 0;
foreach my $wort (sort keys %distinct){
	$cnt++;
	if ($cnt < 10) {
		prt("  $cnt ");
	} elsif ($cnt < 100) {
		prt(" $cnt ");
	} else {
		prt("$cnt ");
	}
	prt( "[$wort] $distinct{$wort} " ); 
	if (is_in_common($wort)) {
		prt( "EXCLUDED" );
	}
	prt("\n");
} 

prt( "Done $cnt output of sorted keys, with count ...\n" );

prt( "Now output a perl qw list ...\n" );
$wid = 0;
my $wmax = 80;
prt( "my \@newwordlist = qw(\n" );
foreach my $word (sort keys %distinct){
	if ( ! is_in_common($word) ) {
		my $len = length($word);
		if (($len + $wid) > $wmax) {
			prt("\n");
			$wid = 0;
		}
		$wid += $len + 1;
		prt($word.' ');
	}
}
prt(");\n");

if ($WordApp) {
	prt( "Closing and quitting word ...\n" );
	$WordApp->Documents->Close; 
	$WordApp->Documents->Quit; 
	$WordApp->Quit; # quit the application
}
prt("Done ... in ". (gettimeofday - $bt) . " seconds ...\n");

exit(0);
#######################################################################

sub use_get_object {
	my $wd;
	# $document = Win32::OLE->GetObject($infile)
	prt( "Getting object for $infile ...\n" );
	$WordObj = Win32::OLE->GetObject($infile);
	if ($WordObj) {
		prt( "Getting the collection ... (object $WordObj)\n" );
		my $cwords = $WordObj->Words(); # get collection
		if ($cwords) {
			prt( "Getting an enumerator ...(collection $cwords)\n" );
			my $enumerate = new Win32::OLE::Enum($cwords);
			if ($enumerate) {
				my $cnt = 0;
				prt( "Processing enumeration ...(enumerator $enumerate)\n" );
				while( defined( $wd = $enumerate->Next() ) ) {
					$cnt++;
					prtv9( "$cnt $wd " );
					# my $txt = lc($wd->{Range}->{Text}); # this is NOT right!
					my $txt = lc($wd->{Text}); # this works fine ...
					prtv9( "[$txt] \n" );
					add_word($txt);
				}
				prt( "Done $cnt enumerations ...\n" );
			} else {
				prt( "Failed to get enumerator ...\n" );
			}
		} else {
			prt( "Failed to get collection ...\n" );
		}
	} else {
		prt("Failed to GetObject ...\n");
	}
}

# or paragraphs
# $paragraphs = $document->Paragraphs();
# $enumerate = new Win32::OLE::Enum($paragraphs);
# while(defined($paragraph = $enumerate->Next())) {
#    $style = $paragraph->{Style}->{NameLocal};
#    $text = $paragraph->{Range}->{Text}; }


sub use_word_app {
	my ($vis) = shift;
	# a stepped approach to openning, or loading Microsoft Word
	prt( "Attaching to Word application ...\n" );
	$WordApp = Win32::OLE->GetActiveObject('Word.Application');
	if ($WordApp) {
		prt( "Using existing running Word application ...\n" );
	} else {
		prt( "Starting NEW Word application ...\n" );
		$WordApp = Win32::OLE->new('Word.Application', 'Quit'); 
		if ($WordApp) {
			prt("New application running ...\n");
		} else {
			mydie( "ERROR: Failed to load Word application ...\n" );
		}
	}

	if ($vis) {
		prt( "Setting word as visible ...\n" );
		$WordApp->{'Visible'} = 1; # if you want to see something, but it seems to SLOW UP things ;=))
	}

	# Load the application with the document
	prt( "Openning document $infile ...\n" );
	$WordApp->Documents->Open($infile) 
	|| mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); 


	prt( "Getting contents of the ActiveDocument ... wait ...\n" );
	my $myRange = $WordApp->ActiveDocument->Content; 

	prt( "Processing ActiveDocument contents ... wait a while ...\n" );
	foreach my $word (in $myRange->Words){ 
		$inhalt = lc($word->{Text});
		add_word($inhalt);
		###print '.';
	} 
}

sub add_word {
	my ($inwd) = shift;
	# try to trim it up a bit
	chomp $inwd; # remove trailing \n char, if any ...
	$inwd =~ s/^\'//g; # remove any beginning single quotes
	$inwd =~ s/^\"//g; # remove any beginning double quotes
	$inwd =~ s/\'$//g; # remove any ending single quotes
	$inwd =~ s/\"$//g; # remove any ending double quotes
	prtv9( "Got lc text [$inwd] ...\n" );
	return if not $inwd =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha 
	###$inwd =~ s/[\s] $//i; 
	##$inwd =~ s/[\s] $//g; # remove trailing spaces
	while ($inwd =~ / $/) {
		$inwd =~ s/ $//g; # remove any trailing spaces 
	}
	prtv9( "Modified to [$inwd], " );
	### $distinct{$inwd}; # was this
	### $distinct{$inwd} = $inhalt; # tried this
	# but better to keep count
	if (defined $distinct{$inwd} ) {
		$distinct{$inwd} = $distinct{$inwd} + 1;
		prtv9( "and bumped count to $distinct{$inwd}...\n" );
	} else {
		prtv9( "and stored first time...\n" );
		$distinct{$inwd} = 1;
	}
}

sub is_in_common {
	my ($wd) = shift;
	foreach my $w (@exclwordlist) {
		if ($w eq $wd) {
			return 1;
		}
	}
	return 0;
}

################################
### output and log file
sub wlog {
	my $ml = shift;
	print $LOG $ml;
}

sub prt {
	my $m = shift;
	if ($write_log) {
		wlog($m);
	}
	print $m;
}

sub prtv9 {
	my $ms = shift;
	if ($verb > 8) {
		prt($ms);
	}
}

sub mydie {
	my $msg = shift;
	if ($write_log) {
		wlog($msg);
	}
	die $msg;
}

# eof - autoword03.pl