#!/usr/bin/perl -w my $rcsid = '$Id: capitalize,v 1.5 2005/09/07 22:27:59 vogelke Exp $'; # NAME: # capitalize # # SYNOPSIS: # capitalize [-hv] [file ...] # # OPTIONS: # -h Print a brief help message and exit. # -v Print the version and exit. # # DESCRIPTION: # Try to capitalize sentences intelligently. Uses stdin or files # on the command line. # # A nouns file ($nouns) is used to identify proper nouns which # should be capitalized within any sentence. # # An exceptions file ($exceptions) is used to identify acronyms, # weekdays, month names, etc. which should also be capitalized. # This file is used to create a set of substitutions run on each # line of input, so more exceptions equals slower performance. # # AUTHOR: # Most of this script comes from a posting to "comp.emacs" # from Larry Wall about an # ALL CAPS to mixed case converter: # # Date: Tue, 11 Sep 1990 16:48:14 -0400 # Here's what Peter Yee uses to translate the NASA articles he # posts. No doubt he has a more up-to-date exception list by now. # # Minor changes by Karl Vogel # Sumaria Systems, Inc. # # NOTES: # Based on hits for "capitalize sentence perl" in "google groups". # # You could use # $string=join '', map {ucfirst} split(/([.?!]\s*)/, $string); # # which takes advantage of the fact that when the expression to # split on is enclosed in capturing parens, split returns a list # that includes the delimiters. Unfortunately, you have a problem # whenever a delimiter is in the middle of a sentence # (like e.g. in this very sentence) # # then you will end up with random capital characters # (like e.G. In this very sentence). # # If you put two spaces between sentences on a single line, and never # end a line with an abbreviation, you can get pretty good results. # For example, here's the regex emacs uses by default to match # sentence ends: # /([.?!][]"')]*(?: | ?[^\S ])\s*)/ # # (Converted to perl regex syntax, somewhat optimized, and tweaked # so that it can be used as a drop-in replacement in the split() # example.) # # If we wanted to capitalize every word in a sentence: # $sentence =~ s/(\W*)(\S+)/$1\u$2/g; # # works for lots of examples, although I'm probably missing a # case that breaks it. The \W* catches all of the non-alphabet # characters at the start of the word, which means that the first # \S match has to be an alphabet character. use strict; use subs qw(usage version); use File::Basename; use Getopt::Long; $ENV{"PATH"} = "/bin:/usr/bin"; # Things you may have to change. my $nouns = '/usr/local/lib/capitalize.nouns'; my $exceptions = '/usr/local/lib/capitalize.special'; # Handle arguments. my $myname = basename($0, ".pl"); my $hflag = 0; my $vflag = 0; GetOptions( "help" => \$hflag, "version" => \$vflag ) or usage(); $hflag and do { usage(); }; $vflag and do { my $vers = version(); warn "$vers\n"; exit(0); }; # Load proper nouns from system dictionary. my $lower; my %proper; open(N, "< $nouns") or die "$nouns: $!\n"; while () { if (/^[A-Z]/) { chomp; ($lower = $_) =~ y/A-Z/a-z/; $proper{$lower} = $_; } } close(N); # Handle exceptions or odd spellings. my $bar; my $foo; my $i; my $prog = ''; open(PATS, "< $exceptions") or die "$exceptions: $!\n"; while () { chomp; next if /^$/; next if /^#/; # allow complete perl expressions. if (!/;$/) { $foo = $_; $foo =~ y/A-Z/a-z/; $foo =~ s/([^\w ])/\\$1/g; $foo =~ s/ /(\\s+)/g; $foo = "\\b" . $foo if $foo =~ /^\w/; $foo .= "\\b" if $foo =~ /\w$/; $i = 0; ($bar = $_) =~ s/ /'$' . ++$i/eg; $_ = "s/$foo/$bar/gi;"; } $prog .= ' ' . $_ . "\n"; } close(PATS); # real work starts here. $/ = ''; while (<>) { $_ = lc($_); # All sentences end with two spaces. s/(\w\w+)\. /$1. /g; # Uppercase the first character at the start of a sentence. $_ = join('', map { ucfirst } split(/([.?!][]"')]*(?: | ?[^\S ])\s*)/, $_)); # handle proper nouns. s/(\w+)/$proper{$1} ? $proper{$1} : $1/eg; # start of a quote. s/"(\w\w+)/"\u$1/g; # handle exceptions. eval $prog; die $@ if $@; print "$_"; } exit(0); #--------------------------------------------------------------------- # Print a usage message from the comment header and exit. sub usage { my ($emsg) = @_; if (open(P, "$myname")) { while (

) { last if /^# NAME:/; } print STDERR "\n NAME:\n"; while (

) { last if /^\s*$/; last if /^# AUTHOR:/; s/^#//; print STDERR; } close(P); } warn("$emsg\n") if $emsg; exit(1); } #--------------------------------------------------------------------- # Return the current version. sub version { $_ = $rcsid; s/,v / /; @_ = split; return "$_[1] v$_[2] $_[3] $_[4]"; }