#!/usr/bin/perl -w #; } # # Use the string '\n' to represent newlines without breaking a line. # s!\n! \\n !g; @words = split; # We don't want to ignore the first and last words in our list, # so add some junk to either side of @words. $keyidx is the position # of the keyword, or word in the middle of each index line. $keyidx = int($n / 2); $k = $keyidx; push(@words, '\n') while $k--; $k = $keyidx; unshift(@words, '\n') while $k--; # Since we want to show our keyword in the context of neighboring terms, # we are going to use an n-gram window with an odd-numbered length # (3,5,7,...). The next step is to display each set of words using the # middle word as the key. # # Since Perl indexes start at 0, we can compute the index of this middle # word by dividing n by two and losing the remainder. If we are working # with 7-grams, for example, the left context will consist of terms # indexed by 0, 1, 2, the keyword will be indexed by 3, and the right # context terms indexed by 4, 5, 6. $min = $n - 1; $max = $#words; $k = 0; %kwic = (); my $basic; my $center; my $cmax = 0; my $left; my $lmax = 0; my $lside; my $right; my $rside; my @junk; while ($min <= $max) { $left = $k + $keyidx - 1; $right = $min - $keyidx + 1; $center = $words[$k + $keyidx]; # ^ NOT A TYPO: better written as a scalar value # Ignore junk, words that are mostly punctuation, and newlines. $basic = canonical($center); unless ($ignore{"$basic"}) { $lside = "@words[$k..$left]"; $lmax = $lmax > length($lside) ? $lmax : length($lside); $rside = "@words[$right..$min]"; $cmax = $cmax > length($center) ? $cmax : length($center); store($basic, $lside, $center, $rside); } $k++; $min++; } # Sort and display the index. display($lmax, $cmax); exit(0); # -------------------------------------------------------------------- # Store each window of words. sub store { my ($canon, $lside, $center, $rside) = @_; my ($key, $count); # We use the canonical form of the center word as the key to # index each line, so we need to check for duplicates and change # the key accordingly. $count = 0; $key = sprintf("%s%3.3d", $canon, $count); while (exists $kwic{$key}) { $count++; $key = sprintf("%s%3.3d", $canon, $count); } $kwic{$key} = "$lside\t$center\t$rside"; } # -------------------------------------------------------------------- # Display the generated index. sub display { my ($lmax, $cmax) = @_; my ($lside, $center, $rside); # Output format resembles: "%35s %-15s %s" my $fmt = '%' . $lmax . 's %-' . $cmax . 's %s' . "\n"; foreach my $x (sort keys %kwic) { $_ = $kwic{$x}; # Remove multiple newline indicators. s/\\n \\n/\\n/ while /\\n \\n/; s/^\\n//; s/\\n$//; ($lside, $center, $rside) = split (/\t/); printf "$fmt", $lside, $center, $rside; } } # -------------------------------------------------------------------- # Return canonical form of a word: no punctuation, lowercase. sub canonical { my $word = shift; my $before = length($word); my ($after, $pct); # single-character words are junk. return '\n' if $before == 1; $word = lc($word); $word =~ tr/"'!@#$%^&*()_+=[]{}|.,:;?-//d; # words that are more than 50% punctuation are junk. $after = length($word); $pct = int ($after/$before * 100); $word = '\n' if $pct <= 50; return $word; } # -------------------------------------------------------------------- # Set up English list of stopwords. sub stopwords { my %ignore; @junk = qw( about all also an and another any are as at back be because been before being both but by can can't could current did do don't each end even first for from get go good had have he here her his how if in into is it it's its just know last like long make many may me might more most much must my name new no not now of off on one only or other our out over part people point put right same say see should since so some start state still such than that the their them then there these they they'll they're this those time to too true try two under up us use value very was way we well were what when where which while who why will with without won't work would you your you're ); push(@junk, '\n'); foreach (@junk) { $ignore{$_} = 1; } return %ignore; } #--------------------------------------------------------------------- # Print a usage message from the comments and exit. sub usage { my ($emsg) = @_; use Pod::Usage qw(pod2usage); warn "$emsg\n" if defined $emsg; pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS"); } sub manpage { use Pod::Man(); my $parser = Pod::Man->new(); open(STDOUT, "| groff -T ascii -man | gcat -s | less") || die "groff\n"; $parser->parse_from_file($0); close STDOUT || die "$myname: can't close stdout: $!\n"; $? = 1 if $? == 255; # from die exit($?); } #--------------------------------------------------------------------- # Print the UUID, current version, or source location. sub myuuid { my $UUID = sprintf("%s", q$UUID: c950eef1-251c-3a25-87dc-011638c27d35 $ =~ /UUID: (.*) /); print "$UUID\n"; exit(0); } sub version { my $VERSION = sprintf("%d.%02d", q$Revision: 1.7 $ =~ /(\d+)\.(\d+)/); my $DATE = sprintf("%s", q$Date: 2008/06/27 22:59:19 $ =~ /Date: (.*) /); print "$myname $VERSION $DATE\n"; exit(0); } sub where { my $SOURCE = sprintf("%s", /Source: (.*) /); print "$SOURCE\n"; exit(0); } #--------------------------------------------------------------------- __END__ =head1 NAME kwic - generate a keyword-in-context index =head1 SYNOPSIS kwic [-hmuvw] [file ...] =head1 OPTIONS =over 4 =item B<-h> Print a brief help message and exit. =item B<-m> Print the manual page and exit. =item B<-n> Set the total number of words to see on either side of the main word. Default is 5. =item B<-u> Print the script UUID and exit. =item B<-v> Print the version and exit. =item B<-w> Print the source location and exit. =back =head1 DESCRIPTION B will read the given input file(s) and print a KWIC index based on the contents. Common English words will be ignored. =head1 AUTHOR Karl Vogel Sumaria Systems, Inc. =cut