#!/Perl
# Series: Microsoft Word Automation
# original from : http://www.stouk.com/documents/perl/gui/guiref/page04.htm
# purpose: To load a Microsoft Word document, get the contents of the document,
# and put all the words into a HASH ... sort and show the HASH ...
# NOTE: If a NEW application is loaded, or the document is LARGE, the
# script can appear stalled. Increase $verb to 9 to see more action
# happening ... sometimes 30-50 seconds to process even 1 page ... I assume
# something to do with getting the OLE server running ...
# Setting word visible seems to INCREASE the time taken?
# It seems to run FASTEST when a copy of Word is already open ...
# author: geoff mclane - email: geoffair@hotmail.com - 2006-06-12
#
# NOTE WELL: ON LARGE FILES, THIS CAN TAKE A VERY LONG TIME, UNLESS
# YOU ALREADY HAVE A COPY OF WORD RUNNING. WHEN THERE IS A COPY OF
# WORD RUNNING, THE RESULTS CAN BE QUITE QUICK!!!!!
# IN FACT IT MAY BE COMPLETELY STALLING IF AN INSTANCE OF WORD IS NOT
# ALREADY RUNNING - THIS INSTANCE ALSO GETS CLOSED AT THE END ...
#
use strict; 
use Win32::OLE qw(in with); 
use Win32::OLE::Const 'Microsoft Word'; 

# *** ALTER THIS TO POINT TO YOUR OWN TEST DOCUMENT ***
#######################################################
my $infile = 'c:\tmp\test.doc';
# #####################################################

my $inhalt = '';
my %distinct = (); # TO HOLD THE FINAL LIST
# just for LOG FILE ouput ...
my ($LOG);
my $write_log = 0;
my $verb = 1; # increase to 9 to see more output
my $outfile = "$0.txt"; # note name of perl file used as base
if ( open( $LOG, ">$outfile" ) ) {
    $write_log = 1; # we have a LOG file
} else {
    $write_log = 0;
    prt( "WARNING: Unable to open $outfile LOG ...\n" );
}

# a stepped approach to openning, or loading Microsoft Word
prt( "Attaching to Word application ...\n" );
my $Word = Win32::OLE->GetActiveObject('Word.Application');
if ($Word) {
	prt( "Using existing running Word application ...\n" );
} else {
	prt( "Starting NEW Word application ...\n" );
	$Word = Win32::OLE->new('Word.Application', 'Quit'); 
	if ($Word) {
		prt("New application running ...\n");
	} else {
		mydie( "ERROR: Failed to load Word application ...\n" );
	}
}

###prt( "Setting word as visible ...\n" );
###$Word->{'Visible'} = 1; # if you want to see something, but it seems to SLOW UP things ;=))

# Load the application with the document
prt( "Openning document $infile ...\n" );
$Word->Documents->Open($infile) 
|| mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); 

prt( "Getting contents of the ActiveDocument ... wait ...\n" );
my $myRange = $Word->ActiveDocument->Content; 

prt( "Processing ActiveDocument contents ... wait ...\n" );

# Collections - Characters Words Sentences Paragraphs Sections HeadersFooters 
foreach my $word (in $myRange->Words){ 
	$inhalt = lc($word->{Text});
	# try to trim it up a bit
	chomp $inhalt; # remove trailing \n char, if any ...
	$inhalt =~ s/^\'//; # remove any beginning single quotes
	$inhalt =~ s/^\"//g; # remove any beginning double quotes
	$inhalt =~ s/\'$//; # remove any ending single quotes
	$inhalt =~ s/\"$//g; # remove any ending double quotes
	prtv9( "Got lc text [$inhalt] ...\n" );
	next if not $inhalt =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha 
	###$inhalt =~ s/[\s] $//i; 
	##$inhalt =~ s/[\s] $//g; # remove trailing spaces
	while ($inhalt =~ / $/) {
		$inhalt =~ s/ $//g; # remove any trailing spaces 
	}
	prtv9( "Modified to [$inhalt], " );
	### $distinct{$inhalt}; # was this
	### $distinct{$inhalt} = $inhalt; # tried this
	# but better to keep count
	if (defined $distinct{$inhalt} ) {
		$distinct{$inhalt} = $distinct{$inhalt} + 1;
		prtv9( "and bumped count to $distinct{$inhalt}...\n" );
	} else {
		prtv9( "and stored first time...\n" );
		$distinct{$inhalt} = 1;
	}
} 

my $wcnt = keys( %distinct );
prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" );
my $cnt = 0;
foreach my $wort (sort keys %distinct){
	$cnt++;
	if ($cnt < 10) {
		prt("  $cnt ");
	} elsif ($cnt < 100) {
		prt(" $cnt ");
	} else {
		prt("$cnt ");
	}
	prt( "[$wort] $distinct{$wort}\n" ); 
} 
prt( "Done $cnt output of sorted keys, with count ...\n" );

prt( "Closing and quitting word ...\n" );
$Word->Documents->Close; 
$Word->Documents->Quit; 
$Word->Quit; # quit the application
prt("Done ...\n");

################################
### output and log file
sub wlog {
	my $ml = shift;
	print $LOG $ml;
}

sub prt {
	my $m = shift;
	if ($write_log) {
		wlog($m);
	}
	print $m;
}

sub prtv9 {
	my $ms = shift;
	if ($verb > 8) {
		prt($ms);
	}
}

sub mydie {
	my $msg = shift;
	if ($write_log) {
		wlog($msg);
	}
	die $msg;
}

# eof - autoword03.pl