#!/usr/bin/perl -w

my $rcsid  = '$Id: capitalize,v 1.5 2005/09/07 22:27:59 vogelke Exp $';

# NAME:
#    capitalize
#
# SYNOPSIS:
#    capitalize [-hv] [file ...]
#
# OPTIONS:
#    -h    Print a brief help message and exit.
#    -v    Print the version and exit.
#
# DESCRIPTION:
#    Try to capitalize sentences intelligently.  Uses stdin or files
#    on the command line.
#
#    A nouns file ($nouns) is used to identify proper nouns which
#    should be capitalized within any sentence.
#
#    An exceptions file ($exceptions) is used to identify acronyms,
#    weekdays, month names, etc. which should also be capitalized.
#    This file is used to create a set of substitutions run on each
#    line of input, so more exceptions equals slower performance.
#
# AUTHOR:
#    Most of this script comes from a posting to "comp.emacs"
#    from Larry Wall <lwall@jpl-devvax.JPL.NASA.GOV> about an
#    ALL CAPS to mixed case converter:
#
#    Date: Tue, 11 Sep 1990 16:48:14 -0400
#    Here's what Peter Yee uses to translate the NASA articles he
#    posts.  No doubt he has a more up-to-date exception list by now.
#
#    Minor changes by Karl Vogel <vogelke@pobox.com>
#    Sumaria Systems, Inc.
#
# NOTES:
#    Based on hits for "capitalize sentence perl" in "google groups".
#
#    You could use
#      $string=join '', map {ucfirst} split(/([.?!]\s*)/, $string);
#
#    which takes advantage of the fact that when the expression to
#    split on is enclosed in capturing parens, split returns a list
#    that includes the delimiters.  Unfortunately, you have a problem
#    whenever a delimiter is in the middle of a sentence
#      (like e.g. in this very sentence)
#
#    then you will end up with random capital characters
#      (like e.G. In this very sentence).
#
#    If you put two spaces between sentences on a single line, and never
#    end a line with an abbreviation, you can get pretty good results.
#    For example, here's the regex emacs uses by default to match
#    sentence ends:
#      /([.?!][]"')]*(?:  | ?[^\S ])\s*)/
#
#    (Converted to perl regex syntax, somewhat optimized, and tweaked
#    so that it can be used as a drop-in replacement in the split()
#    example.)
#
#    If we wanted to capitalize every word in a sentence:
#      $sentence =~ s/(\W*)(\S+)/$1\u$2/g;
#
#    works for lots of examples, although I'm probably missing a
#    case that breaks it.  The \W* catches all of the non-alphabet
#    characters at the start of the word, which means that the first
#    \S match has to be an alphabet character.

use strict;
use subs qw(usage version);
use File::Basename;
use Getopt::Long;
$ENV{"PATH"} = "/bin:/usr/bin";

# Things you may have to change.

my $nouns      = '/usr/local/lib/capitalize.nouns';
my $exceptions = '/usr/local/lib/capitalize.special';

# Handle arguments.

my $myname = basename($0, ".pl");
my $hflag  = 0;
my $vflag  = 0;

GetOptions(
    "help"    => \$hflag,
    "version" => \$vflag
    ) or usage();

$hflag and do {
    usage();
};

$vflag and do {
    my $vers = version();
    warn "$vers\n";
    exit(0);
};

# Load proper nouns from system dictionary.

my $lower;
my %proper;

open(N, "< $nouns") or die "$nouns: $!\n";
while (<N>) {
    if (/^[A-Z]/) {
        chomp;
        ($lower = $_) =~ y/A-Z/a-z/;
        $proper{$lower} = $_;
    }
}
close(N);

# Handle exceptions or odd spellings.

my $bar;
my $foo;
my $i;
my $prog = '';

open(PATS, "< $exceptions") or die "$exceptions: $!\n";
while (<PATS>) {
    chomp;
    next if /^$/;
    next if /^#/;

    # allow complete perl expressions.
    if (!/;$/) {
        $foo = $_;
        $foo =~ y/A-Z/a-z/;
        $foo =~ s/([^\w ])/\\$1/g;
        $foo =~ s/ /(\\s+)/g;
        $foo = "\\b" . $foo if $foo =~ /^\w/;
        $foo .= "\\b" if $foo =~ /\w$/;
        $i = 0;
        ($bar = $_) =~ s/ /'$' . ++$i/eg;
        $_ = "s/$foo/$bar/gi;";
    }

    $prog .= ' ' . $_ . "\n";
}
close(PATS);

# real work starts here.

$/ = '';
while (<>) {
    $_ = lc($_);

    # All sentences end with two spaces.
    s/(\w\w+)\. /$1.  /g;

    # Uppercase the first character at the start of a sentence.
    $_ = join('',
        map { ucfirst }
          split(/([.?!][]"')]*(?:  | ?[^\S ])\s*)/, $_));

    # handle proper nouns.
    s/(\w+)/$proper{$1} ? $proper{$1} : $1/eg;

    # start of a quote.
    s/"(\w\w+)/"\u$1/g;

    # handle exceptions.
    eval $prog;
    die $@ if $@;

    print "$_";
}

exit(0);

#---------------------------------------------------------------------
# Print a usage message from the comment header and exit.

sub usage {
    my ($emsg) = @_;

    if (open(P, "$myname")) {
        while (<P>) {
            last if /^# NAME:/;
        }

        print STDERR "\n NAME:\n";

        while (<P>) {
            last if /^\s*$/;
            last if /^# AUTHOR:/;
            s/^#//;
            print STDERR;
        }

        close(P);
    }

    warn("$emsg\n") if $emsg;
    exit(1);
}

#---------------------------------------------------------------------
# Return the current version.

sub version {
    $_ = $rcsid;
    s/,v / /;
    @_ = split;
    return "$_[1]  v$_[2]  $_[3] $_[4]";
}