#!/usr/bin/perl -w
=head1 NAME

find-hidden-word-text - find hidden text in MS Word documents

=head1 SYNOPSIS

find-hidden-word-text word.doc > hidden.txt

=head1 DESCRIPTION

This is a command-line UNIX tool to ease the task of discovering hidden text
in MS Word documents. 

More specifically, it is an implementation of Method 2 from Simon Byers'
paper, _Scalable Exploitation of, and Responses to Information Leakage
Through Hidden Data in Published Documents_, at
<URL:http://www.user-agent.org/word_docs.pdf>.

This goes a little further in that it removes some common 'noise' strings,
like 'Word.Document.8', 'Title', 'PAGE', 'Microsoft Word Document' and
the like.  It will also remove any strings that do not contain at least
1 whitespace character.

=head1 PREREQUISITES

This tool requires that both antiword and the strings(1) utility be
installed and available on the PATH.

=head1 AUTHOR

Justin Mason, C<jm dash wordtext at jmason dot org>

=head1 VERSION

1.0 Aug 15 2003 jm

=cut

use strict;
use warnings;

# Common Word boilerplate that is never interesting.  Entries are compared
# exactly against the whitespace-normalized candidate string.  Candidates
# that contain no space are rejected before this table is consulted, which
# is why single-word droppings such as 'PAGE', 'Normal', 'Title',
# 'MSWordDoc' or 'Word.Document.8' need no entry here.
my %is_noise = map { $_ => 1 } (
  'Microsoft Word 9.0',
  'Microsoft Word Document',
  'Click to edit Master text styles',
  'Click to edit Master title style',
  'Embedded OLE Servers',
);

# Only label each file's output when more than one file was requested.
my $print_names = (scalar @ARGV > 1) ? 1 : 0;

foreach my $file (@ARGV) {
  print "\n$file\n\n" if $print_names;

  # The "visible" text, as a Word user would see it.
  my ($visible, $aw_ok) = slurp_command('antiword', '-t', $file);
  $aw_ok or die "cannot run antiword -t $file (wait status $?)";

  # Every printable string in the raw file, visible or not.
  my ($raw, $str_ok) = slurp_command('strings', $file);
  $str_ok
    or warn "strings $file did not exit cleanly (wait status $?); "
          . "results may be incomplete\n";

  # output the strings and their counts
  my $count = count_hidden_strings($visible, $raw);
  foreach my $text (sort keys %$count) {
    print "$count->{$text}|$text\n";
  }
}

# slurp_command(@cmd) -> ($output, $ok)
#
# Runs @cmd and returns everything it wrote to stdout, plus the result of
# closing the pipe (false when the command exited non-zero).  The list form
# of the piped open is used so the command is executed directly rather than
# through a shell: file names containing spaces, quotes or shell
# metacharacters are passed through verbatim and cannot inject commands.
# Dies if the command cannot be started at all.
sub slurp_command {
  my @cmd = @_;

  open(my $pipe, '-|', @cmd) or die "cannot run $cmd[0]: $!";
  my $output = join('', <$pipe>);
  my $ok = close($pipe);

  return ($output, $ok);
}

# count_hidden_strings($visible, $raw) -> \%count
#
# $visible is the text antiword rendered; $raw is the newline-separated
# output of strings(1).  Returns a hash ref mapping each "hidden" string to
# the number of times it occurred in $raw.  A string is reported when, after
# collapsing runs of whitespace to one space and trimming both ends, it
#   - contains at least one space (drops binary junk and single tokens),
#   - is not a known piece of Word boilerplate, and
#   - does not occur anywhere in the whitespace-normalized visible text.
sub count_hidden_strings {
  my ($visible, $raw) = @_;

  # normalize the antiword version so line wrapping cannot hide a match
  $visible =~ s/\s+/ /g;

  my %count;
  foreach my $line (split(/\n/, $raw)) {
    my $text = $line;
    $text =~ s/\s+/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;

    next if index($text, ' ') < 0;        # no spaces!
    next if $is_noise{$text};             # common word droppings
    next if index($visible, $text) >= 0;  # literal match: it is visible

    $count{$text}++;
  }

  return \%count;
}
