#!/usr/bin/perl
#
# $Id: html2words,v 1.2 1997/10/21 20:54:17 vogelke Exp $
#
# NAME:
#    html2words
#
# SYNOPSIS:
#    html2words [-v]
#
# DESCRIPTION:
#    "html2words" is a program to strip tags from HTML files, weed
#    out garbage words, and print what's left to stdout.
#
#    Filenames are read from stdin.
#
# OPTIONS:
#    "-v" prints the current version and exits.
#
# AUTHOR:
#    Karl E. Vogel
#    Sumaria Systems, Inc.

eval 'exec perl -S $0 ${1+"$@"}'    # If the shell can't handle "#!",
    if 0;                           # fire up perl directly.

use strict;
use warnings;
use Getopt::Std;                    # command line args.

$ENV{"PATH"} = "/bin:/usr/sbin:/usr/local/bin";

(my $myname = $0) =~ s{.*/}{}s;     # script basename.

my %ig;                             # words to drop; filled in by ignore().

#
# Trap most common signals.  Handle command line arguments (if any).
#

$SIG{$_} = \&sigcatcher for qw(HUP INT QUIT TERM);

my %opt;
getopts('v', \%opt) or usage();
version() if $opt{v};

#
# Set up list of words to drop.
# Read and process a list of files from stdin, one filename per line.
#

ignore();

while (my $file = <STDIN>) {
    chomp $file;
    next unless length $file;       # a blank line is not a filename.
    readfile($file);
}

# The leading "&" is required: it selects the "exit" sub defined below
# instead of the builtin of the same name.
&exit(0);

#---------------------------------------------------------------------
# Print a short usage message from the comment header and exit.
# The header is read from the script itself ($0), from the line after
# "# NAME:" up to the first blank line or the "# AUTHOR:" line.
# Exits with status 1.
#
sub usage {
    if (open(my $prog, '<', $0)) {
        # Skip everything up to and including the NAME: heading.
        while (my $line = <$prog>) {
            last if $line =~ /^# NAME:/;
        }

        print STDERR " NAME:\n";

        while (my $line = <$prog>) {
            last if $line =~ /^\s*$/;
            last if $line =~ /^# AUTHOR:/;
            $line =~ s/^#//;
            print STDERR $line;
        }
        close($prog);
    }
    else {
        print STDERR "No usage information available.\n";
    }

    &exit(1);
}

#---------------------------------------------------------------------
# Do something if we get a signal.
# Perl passes the signal name (without the "SIG" prefix) as the first
# argument.  Exits with status 2.
#
sub sigcatcher {
    my ($sig) = @_;
    &exit(2, "caught signal SIG$sig -- shutting down.\n");
}

#---------------------------------------------------------------------
# Print the current version and exit.
# The version line is built by stripping the RCS keyword markup from
# the strings below.  Exits with status 0.
#
sub version {
    my $banner = '$RCSfile: html2words,v $ $Revision: 1.2 $ '
               . '$Date: 1997/10/21 20:54:17 $';

    $banner =~ s/RCSfile: //;
    $banner =~ s/.Date: //;
    $banner =~ s/,v . .Revision: / v/;
    $banner =~ s/\$//g;

    print "$banner\n";
    CORE::exit(0);
}

#---------------------------------------------------------------------
# Clean up.
#   $code  process exit status.
#   $msg   optional message, written to stderr prefixed by the script
#          name.
# This script creates no temporary files, so there is nothing to remove.
# CORE::exit is spelled out because this sub shares the builtin's name.
#
sub exit {
    my ($code, $msg) = @_;
    warn "$myname: $msg\n" if $msg;
    CORE::exit($code);
}

# ------------------------------------------------------------------------
# Set up list of words to ignore.
# Use both lowercase and first-letter-capitalized versions of each word.
# Words are read from the data section after __END__; they may appear
# one per line or several per line separated by whitespace.
#
sub ignore {
    while (my $line = <DATA>) {
        for my $word (split ' ', $line) {
            $ig{$word}          = 1;
            $ig{ucfirst($word)} = 1;
        }
    }
}

# ------------------------------------------------------------------------
# Read a given file.  To delete non-alpha characters in a word:
#       y/a-zA-Z0-9//cd;
#
# To replace non-alpha characters with spaces:
#       y/a-zA-Z0-9/ /cs;
#
#   $file  name of the HTML file to process.
#
# The file is fed to "html2txt" on its standard input.  The unique
# words that survive filtering are printed in sorted order after a
# "FILE: <name>" line, followed by one empty line.
#
sub readfile {
    my ($file) = @_;
    my %wc;                         # unique words seen in this file.

    # Fork with the child's stdout connected to $in.  No shell is
    # involved, so odd characters in $file cannot be misinterpreted.
    my $pid = open(my $in, '-|');
    die "FILE ($file): $!\n" unless defined $pid;

    if ($pid == 0) {
        # Child: make the file our stdin, then become html2txt.
        open(STDIN, '<', $file) or die "$myname: $file: $!\n";
        exec { 'html2txt' } 'html2txt'
            or die "$myname: cannot run html2txt: $!\n";
    }

    print "FILE: $file\n";

    while (my $line = <$in>) {
        $line =~ y/a-zA-Z0-9\'\./ /cs;  # keep alphanumerics, quote, period.
        $wc{$_} = 1 for split(/\W*\s+\W*/, $line);
    }

    # close() on a pipe also reaps the child and reports its status.
    close($in)
        or warn "$myname: html2txt failed for $file (status $?)\n";

    for my $word (sort keys %wc) {
        # ignore common words.
        next if $ig{$word};

        # ignore words that are entirely decimal or hex integers.
        # (The first pattern also drops the empty string that split
        # produces for leading separators.)
        next if $word =~ /^[0-9]*$/ || $word =~ /^0x[0-9a-f]*$/i;

        print "$word\n";
    }

    print "\n";
}

__END__
I I'll I've a about above according across actually after afterwards
again against all almost alone along already also although always among
amongst an and another any anyhow anyone anything anywhere are aren't
around as at b back be became because become becomes becoming been
before beforehand begin beginning behind being below beside besides
between beyond billion both but by c came can can't cannot cant caption
come could couldn't current d day did didn't do does doesn't doesnt
don't dont down during e each eight eighty either else elsewhere end
ending enough even ever every everyone everything everywhere except f
fifty first five for former formerly forty found four from further g
get go good great h had has hasn't have haven't he he'd he'll he's
hence her here here's hereafter hereby herein hereupon hers herself him
himself his how however hundred i i'll i've if in inc. indeed instead
into is isn't it it's its itself j just k know l last later latter
latterly least less let's life like likely little long m made make
makes man many may maybe me meantime meanwhile men might million miss
more moreover most mostly mr much must my myself n name namely neither
never nevertheless new next nine ninety no nobody none nonetheless
noone not nothing now nowhere o of off often old on once one one's only
onto or other others otherwise our ours ourselves out over overall own
p part people perhaps point q r rather re recent recently right s said
same say see seem seemed seeming seems seven seventy several she she'd
she'll she's should shouldn't since sixty so some somehow someone
something sometime sometimes somewhere start state still stop such t
take taking than that that'll that's that've the their them themselves
then thence there there'd there'll there're there's there've thereafter
thereby therefore therein thereupon these they they'd they'll they're
they've thirty this
those though thousand three through throughout thru thus time to together too toward towards trillion true try twenty two u under unless unlike unlikely until up upon us use used using v value very vs w was wasn't way we we'd we'll we're we've well were weren't what what'll what's what've whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who'd who'll who's whoever whole whom whomever whose why will with within without won't wont work world would wouldn't x y year years yes yet you you'd you'll you're you've your yours yourself yourselves z