#!/Perl
# NAME: autoword05.pl
# AIM: To test out the following -
# from : http://www.wellho.net/solutions/perl-using-perl-to-read-microsoft-word-documents.html
# Extracting PARAGRAPHS from a word document.
# 29/08/2007 - geoff mclane - geoffair.net/mperl
use strict;
use warnings;
use Win32::OLE;
use Win32::OLE::Enum;
######################################
# establish in and out file names
my $in_file = 'c:\tmp\test.doc';
###my $in_file = 'C:\Documents and Settings\Geoff McLane\My Documents\Tidy\Php-01.doc';
my $out_file = "temp.$0.txt";
### OR USE COMMAND LINE
###$document = Win32::OLE -> GetObject($ARGV[1]);
###open (FH,">$ARGV[0]");
# $out_file = $ARGV[0];
# $in_file = $ARGV[1];
if ( ! -f $in_file) {
	die "Can not locate file [$in_file] ...\n";
}
my ($i, $msg, $document, $paragraph, $paragraphs, $enumerate, $style, $text, $len, $hex, $ntxt, $ch, $val, $hv);

$document = Win32::OLE->GetObject($in_file) or die "Unable to OPEN $in_file ...\n";
open (FH,">$out_file") or die "Unable to create $out_file ...\n";

$msg = "Extracting Text (in paragraphs) from $in_file ...\n";
print FH $msg;
print $msg;

show_crlf();	# just a DEBUG display ^M and ^J ...

# Collections - Characters Words Sentences Paragraphs Sections HeadersFooters 
$paragraphs = $document->Paragraphs();
$enumerate = new Win32::OLE::Enum($paragraphs);
while(defined($paragraph = $enumerate->Next()))
{
    $style = $paragraph->{Style}->{NameLocal};
    ###print FH "+[$style]\n";
    $text = $paragraph->{Range}->{Text};
	$len = length($text);
	$hex = '';
	$ntxt = '';
	for ($i = 0; $i < $len; $i++) {
		$ch = substr($text,$i,1);
		$val = ord($ch);
		$hv = dec2hex($val);
		if (($val >= 32) && ($val < 128)) {
			$hex .= $ch;
			$ntxt .= $ch;
		} else {
			$hex .= ' ' if length($hex);
			$hex .= $hv;
			if ($val >= 128) {
				$ntxt .= "\@$hv";
			} else {
				$ntxt .= '^';
				$ntxt .= chr($val + 64);
			}
		}
	}
    print FH "=[$text]\n";
	print FH "h[$hex]\n";
    $text =~ s/[\n\r]//g;
    $text =~ s/\x0b/\n/g;
	print "$ntxt ($hex)\n";
}

close FH;
system($out_file);

exit(0);

############################
##### subs
sub dec2hex {
    my $decnum = shift;	# or $_[0]; or @_;    # parameter passed to the subfunction
    my $hexnum = '';     # the final hex number
    my $tempval = 0;
    while ($decnum != 0) {
		# get the remainder (modulus function)
		# by dividing by 16
		$tempval = $decnum % 16;
		# convert to the appropriate letter
		# if the value is greater than 9
		if ($tempval > 9) {
			$tempval = chr($tempval + 55);
		}
		# 'concatenate' the number to 
		# what we have so far in what will
		# be the final variable
		$hexnum = $tempval . $hexnum ;
		# new actually divide by 16, and 
		# keep the integer value of the 
		# answer
		$decnum = int($decnum / 16); 
		# if we cant divide by 16, this is the
		# last step
		if ($decnum < 16) {
			# convert to letters again..
			if ($decnum > 9) {
				$decnum = chr($decnum + 55);
			}
			# add this onto the final answer.. 
			# reset decnum variable to zero so loop
			# will exit
			$hexnum = $decnum . $hexnum; 
			$decnum = 0 
		}
    }
    return $hexnum;
} # end sub

#################
sub show_crlf {
	$ch = "\r";
	$val = ord($ch);
	$hv = dec2hex($val);
	$ntxt = '^';
	$ntxt .= chr($val + 64);
	$msg = "Value $val = 0x$hv = Carriage return = $ntxt ...\n";
	print FH $msg;
	print $msg;
	$ch = "\n";
	$val = ord($ch);
	$hv = dec2hex($val);
	$ntxt = '^';
	$ntxt .= chr($val + 64);
	$msg = "Value $val = 0x$hv = Line feed = $ntxt ...\n";
	print FH $msg;
	print $msg;
}


# eof - autoword05.pl

