#!/usr/bin/perl
#
# $Id: html2words,v 1.2 1997/10/21 20:54:17 vogelke Exp $
#
# NAME:
#	html2words
#
# SYNOPSIS:
#	html2words [-v]
#
# DESCRIPTION:
#	"html2words" is a program to strip tags from HTML files, weed
#	out garbage words, and print what's left to stdout.
#
#	Filenames are read from stdin.
#
# OPTIONS:
#	"-v" prints the current version and exits.
#
# AUTHOR:
#	Karl E. Vogel <vogelke@c17.com>
#	Sumaria Systems, Inc.

eval 'exec perl -S $0 ${1+"$@"}'	# If the shell can't handle "#!",
	if 0;				# fire up perl directly.

# Getopt::Std replaces the Perl 4 "getopts.pl" library, which is no longer
# shipped with perl (removed from the core in 5.16).  getopts() still sets
# $opt_v and returns false on an unknown option.
use Getopt::Std;			# command line args.

$ENV{"PATH"} = "/bin:/usr/sbin:/usr/local/bin";
($myname = $0) =~ s,.*/,,;		# script basename.

#
# Trap most common signals.  Handle command line arguments (if any).
#

foreach my $signame ('HUP', 'INT', 'QUIT', 'TERM')
{
	$SIG{$signame} = 'sigcatcher';
}

&usage () unless getopts ('v');
&version () if $opt_v;

#
# Set up list of words to drop.
# Read and process a list of files from stdin, one filename per line.
#

&ignore ();

while (defined ($file = <STDIN>))
{
	# chomp (not chop) so a final line without a newline is not truncated.
	chomp ($file);

	# An empty name would only produce a broken html2txt command line.
	next if $file eq '';

	&readfile ($file);
}

&exit (0);


#---------------------------------------------------------------------
#	Print a short usage message from the comment header and exit.
#
#	The message is taken from this script's own leading comment
#	block: everything after the "# NAME:" line up to the first
#	blank line or "# AUTHOR:" line, with the leading "#" removed.
#	Output goes to stderr; the script then exits with status 1
#	through &exit.
#

sub usage
{
	my $line;

	# Open the script via $0 (the path it was started with) rather than
	# its basename, which only works when the current directory happens
	# to be the one holding the script.
	if (open (PROG, '<', $0))
	{
		# Skip everything up to and including the NAME heading.
		while (defined ($line = <PROG>))
		{
			last if $line =~ /^# NAME:/;
		}

		print STDERR " NAME:\n";

		while (defined ($line = <PROG>))
		{
			last if $line =~ /^\s*$/;
			last if $line =~ /^# AUTHOR:/;
			$line =~ s/^#//;
			print STDERR $line;
		}

		close (PROG);
	}
	else
	{
		print STDERR "No usage information available.\n" ;
	}

	&exit (1);
}


#---------------------------------------------------------------------
#	Signal handler: report the signal that arrived and shut down.
#
#	The first argument is the signal name handed to the handler
#	(e.g. "INT").  The script exits with status 2 through &exit.
#

sub sigcatcher
{
	my ($signame) = @_;
	my $notice = "caught signal SIG" . $signame . " -- shutting down.\n";

	&exit (2, $notice);
}


#---------------------------------------------------------------------
#	Print the current version and exit.
#
#	The version line is built from the RCS keyword strings below by
#	removing the keyword markup, leaving the file name, revision and
#	check-in date.  It is written to stdout and the script exits
#	with status 0.
#

sub version
{
	my $banner = '$RCSfile: html2words,v $ $Revision: 1.2 $ ' .
		'$Date: 1997/10/21 20:54:17 $';

	# Peel the RCS markup off one piece at a time.
	$banner =~ s/RCSfile: //;
	$banner =~ s/.Date: //;
	$banner =~ s/,v . .Revision: /  v/;
	$banner =~ s/\$//g;

	print "$banner\n";
	exit (0);
}


#---------------------------------------------------------------------
#	Clean up and terminate the script.
#
#	Arguments:
#	  $code  exit status handed to the built-in exit.
#	  $msg   optional message; when non-empty it is printed on
#	         stderr, prefixed with the script name.
#
#	Called as &exit (...) so that this sub, not the built-in, runs.
#

sub exit
{
	my ($code, $msg) = @_;

	# $tmp is only removed if something has actually set it; nothing in
	# the visible part of this script does, so avoid unlinking undef.
	unlink ($tmp) if defined ($tmp) && $tmp ne '';

	if (defined ($msg))
	{
		# Callers may or may not end the message with a newline;
		# strip it so exactly one is printed.
		$msg =~ s/\n+$//;
		warn "$myname: $msg\n" if $msg ne '';
	}

	exit ($code);			# the built-in exit.
}


# ------------------------------------------------------------------------
# Set up list of words to ignore.
# Use both lowercase and first-letter-capitalized versions of each word.
#
# Words are read one per line from the DATA filehandle (the text after
# __END__ in this script) and stored as keys of the global %ig with the
# value 1.  Blank lines are skipped.
#

sub ignore
{
	my $word;

	while (defined ($word = <DATA>))
	{
		# chomp (not chop): the last word must survive even when the
		# data section does not end with a newline.
		chomp ($word);
		next if $word eq '';

		$ig{$word} = 1;
		$ig{ucfirst ($word)} = 1;
	}
}

# ------------------------------------------------------------------------
# Read a given file and print the distinct words found in it.
#
# The file is fed on stdin to the external "html2txt" command (looked up
# through PATH by the shell) and that command's output is read line by
# line.  Written to stdout: a "FILE: <name>" header, the sorted unique
# words one per line, and a blank line.  Words listed in %ig, empty
# strings, and words that are entirely decimal or hex integers are
# dropped.  Dies if the pipe cannot be started; warns on stderr if
# closing the pipe reports a failure.
#
# To delete non-alpha characters in a word:
#	y/a-zA-Z0-9//cd;
#
# To replace non-alpha characters with spaces:
#	y/a-zA-Z0-9/ /cs;
#

sub readfile
{
	my ($file) = @_;
	my (%wc, $line, $quoted);

	# The name comes from stdin and goes through the shell, so wrap it
	# in single quotes (escaping embedded ones) to keep spaces and
	# metacharacters from being interpreted.
	($quoted = $file) =~ s/'/'\\''/g;

	open (IN, "html2txt < '$quoted' |") || die "FILE ($file): $!\n";
	print "FILE: $file\n";

	while (defined ($line = <IN>))
	{
		$line =~ y/a-zA-Z0-9\'\./ /cs;	# keep alphanumerics, quote, period.

		foreach my $word (split (/\W*\s+\W*/, $line))
		{
			$wc{$word} = 1;
		}
	}

	# close() on a pipe returns false when the command failed.
	close (IN) ||
		warn "$myname: html2txt failed for $file (wait status $?)\n";

	foreach my $word (sort keys %wc)
	{
		# ignore common words.
		next if $ig{$word};

		# ignore words that are entirely decimal or hex integers
		# (this also drops the empty string).
		next if $word =~ /^[0-9]*$/ || $word =~ /^0x[0-9a-f]*$/i;

		print "$word\n";
	}

	print "\n";
}

__END__
I
I'll
I've
a
about
above
according
across
actually
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
came
can
can't
cannot
cant
caption
come
could
couldn't
current
d
day
did
didn't
do
does
doesn't
doesnt
don't
dont
down
during
e
each
eight
eighty
either
else
elsewhere
end
ending
enough
even
ever
every
everyone
everything
everywhere
except
f
fifty
first
five
for
former
formerly
forty
found
four
from
further
g
get
go
good
great
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'll
i've
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
just
k
know
l
last
later
latter
latterly
least
less
let's
life
like
likely
little
long
m
made
make
makes
man
many
may
maybe
me
meantime
meanwhile
men
might
million
miss
more
moreover
most
mostly
mr
much
must
my
myself
n
name
namely
neither
never
nevertheless
new
next
nine
ninety
no
nobody
none
nonetheless
noone
not
nothing
now
nowhere
o
of
off
often
old
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
part
people
perhaps
point
q
r
rather
re
recent
recently
right
s
said
same
say
see
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
start
state
still
stop
such
t
take
taking
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
time
to
together
too
toward
towards
trillion
true
try
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
use
used
using
v
value
very
vs
w
was
wasn't
way
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
wont
work
world
would
wouldn't
x
y
year
years
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z