#!/usr/bin/perl -w
#<kwic: generate a keyword-in-context (KWIC) index.

use Getopt::Long;
use File::Basename;
use Carp;
use strict;
use diagnostics;

use subs qw(manpage myuuid usage version where);
$ENV{'PATH'} = join ":", qw(/bin /usr/bin /usr/local/bin /opt/sfw/bin);

my $myname = basename($0);
$myname =~ s/\.\w*$//;   # strip any extension

#
# Handle the command-line options.
#

my %options;
my @getopt_args = (
    'h|?',    # print usage
    'm',      # print manpage
    'n=i',    # set number of words to view
    'u',      # print UUID
    'v',      # print version
    'w',      # print source location
    );

Getopt::Long::config("noignorecase", "bundling");
usage unless GetOptions(\%options, @getopt_args);

manpage if $options{'m'};
myuuid  if $options{'u'};
version if $options{'v'};
where   if $options{'w'};
usage   if $options{'h'};

my $k;
my $keyidx;
my $max;
my $min;
my $n;
my %ignore;
my %kwic;
my @words;

#
# Decide how many words go in our sliding window.
# Set up stopwords.
#

$n = $options{'n'} || 5;
$n = ($n * 2) + 1;
%ignore = stopwords();

#
# Read the entire file, since we'll be going through it
# word-by-word instead of line-by-line.
#

{
    undef $/;
    $_ = <>;
}

#
# Use the string '\n' to represent newlines without breaking a line.
#

s!\n! \\n !g;
@words = split;

# We don't want to ignore the first and last words in our list,
# so add some junk to either side of @words.  $keyidx is the position
# of the keyword, or word in the middle of each index line.

$keyidx = int($n / 2);

$k = $keyidx;
push(@words, '\n') while $k--;

$k = $keyidx;
unshift(@words, '\n') while $k--;

# Since we want to show our keyword in the context of neighboring terms,
# we are going to use an n-gram window with an odd-numbered length
# (3,5,7,...).  The next step is to display each set of words using the
# middle word as the key.
#
# Since Perl indexes start at 0, we can compute the index of this middle
# word by dividing n by two and losing the remainder.  If we are working
# with 7-grams, for example, the left context will consist of terms
# indexed by 0, 1, 2, the keyword will be indexed by 3, and the right
# context terms indexed by 4, 5, 6.

$min = $n - 1;
$max = $#words;
$k = 0;
%kwic = ();

my $basic;
my $center;
my $cmax = 0;
my $left;
my $lmax = 0;
my $lside;
my $right;
my $rside;
my @junk;

while ($min <= $max) {
    $left  = $k + $keyidx - 1;
    $right = $min - $keyidx + 1;

    $center = $words[$k + $keyidx];
    #         ^ NOT A TYPO: better written as a scalar value

    # Ignore junk, words that are mostly punctuation, and newlines.

    $basic = canonical($center);

    unless ($ignore{"$basic"}) {
        $lside = "@words[$k..$left]";
        $lmax = $lmax > length($lside) ? $lmax : length($lside);

        $rside = "@words[$right..$min]";
        $cmax = $cmax > length($center) ? $cmax : length($center);
        store($basic, $lside, $center, $rside);
    }

    $k++;
    $min++;
}

# Sort and display the index.

display($lmax, $cmax);
exit(0);


# --------------------------------------------------------------------
# Store each window of words.

sub store {
    my ($canon, $lside, $center, $rside) = @_;
    my ($key, $count);

    # We use the canonical form of the center word as the key to
    # index each line, so we need to check for duplicates and change
    # the key accordingly.

    $count = 0;
    $key = sprintf("%s%3.3d", $canon, $count);

    while (exists $kwic{$key}) {
        $count++;
        $key = sprintf("%s%3.3d", $canon, $count);
    }

    $kwic{$key} = "$lside\t$center\t$rside";
}


# --------------------------------------------------------------------
# Display the generated index.

sub display {
    my ($lmax, $cmax) = @_;
    my ($lside, $center, $rside);

    # Output format resembles: "%35s  %-15s  %s"
    my $fmt = '%' . $lmax . 's  %-' . $cmax . 's  %s' . "\n";

    foreach my $x (sort keys %kwic) {
        $_ = $kwic{$x};

        # Remove multiple newline indicators.
        s/\\n \\n/\\n/ while /\\n \\n/;
        s/^\\n//;
        s/\\n$//;

        ($lside, $center, $rside) = split (/\t/);
        printf "$fmt", $lside, $center, $rside;
    }
}


# --------------------------------------------------------------------
# Return canonical form of a word: no punctuation, lowercase.

sub canonical {
    my $word = shift;
    my $before = length($word);
    my ($after, $pct);

    # single-character words are junk.
    return '\n' if $before == 1;

    $word = lc($word);
    $word =~ tr/"'!@#$%^&*()_+=[]{}|.,:;?-//d;

    # words that are more than 50% punctuation are junk.
    $after = length($word);
    $pct = int ($after/$before * 100);
    $word = '\n' if $pct <= 50;

    return $word;
}


# --------------------------------------------------------------------
# Set up English list of stopwords.

sub stopwords {
    my %ignore;

    @junk = qw(
      about all also an and another any are as at back be because been
      before being both but by can can't could current did do don't
      each end even first for from get go good had have he here her
      his how if in into is it it's its just know last like long make
      many may me might more most much must my name new no not now
      of off on one only or other our out over part people point put
      right same say see should since so some start state still such
      than that the their them then there these they they'll they're
      this those time to too true try two under up us use value very
      was way we well were what when where which while who why will
      with without won't work would you your you're
      );

    push(@junk, '\n');

    foreach (@junk) {
        $ignore{$_} = 1;
    }

    return %ignore;
}

#---------------------------------------------------------------------
# Print a usage message from the comments and exit.

sub usage {
    my ($emsg) = @_;
    use Pod::Usage qw(pod2usage);
    warn "$emsg\n" if defined $emsg;
    pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS");
}

sub manpage {
    my @args = ("perldoc", "$0");
    exec { $args[0] } @args;          # safe even with one-arg list
    die("should not get here\n");
}

#---------------------------------------------------------------------
# Print the UUID, current version, or source location.

sub myuuid {
    my $UUID = sprintf("%s",
        q$UUID: c950eef1-251c-3a25-87dc-011638c27d35 $ =~ /UUID: (.*) /);
    print "$UUID\n";
    exit(0);
}

sub version {
    my $VERSION = sprintf("%d.%02d", q$Revision: 1.8 $ =~ /(\d+)\.(\d+)/);
    my $DATE =
      sprintf("%s", q$Date: 2010-11-02 20:57:27-04 $ =~ /Date: (.*) /);
    print "$myname $VERSION $DATE\n";
    exit(0);
}

sub where {
    my $SOURCE = sprintf("%s",
        q$Source: /home/vogelke/projects/kwic/RCS/kwic,v $ =~
          /Source: (.*) /);
    my $HOST = $1 if q$Host: sys7.com $ =~ /Host: (.*) /;
    print "file://$HOST", "$SOURCE\n";
    exit(0);
}

#---------------------------------------------------------------------
__END__

=head1 NAME

kwic - generate a keyword-in-context index

=head1 SYNOPSIS

kwic [-hmuvw] [file ...]

=head1 OPTIONS

=over 4

=item B<-h>

Print a brief help message and exit.

=item B<-m>

Print the manual page and exit.

=item B<-n>

Set the total number of words to see on either side of the main word.
Default is 5.

=item B<-u>

Print the script UUID and exit.

=item B<-v>

Print the version and exit.

=item B<-w>

Print the source location and exit.

=back

=head1 DESCRIPTION

B<kwic> will read the given input file(s) and print a KWIC index based
on the contents.  Common English words will be ignored.

=head1 AUTHOR

 Karl Vogel <vogelke@pobox.com>
 Sumaria Systems, Inc.

=cut