#!/usr/bin/perl -w
#<notags: strip anything resembling HTML, replace char entities.
# http://www.perlmonks.org/?node_id=161281

use Getopt::Long;
use File::Basename;
use Carp;
use strict;
use diagnostics;

use subs qw(manpage myuuid spchar untag usage version where);
$ENV{'PATH'} = join ":", qw(/bin /usr/bin /usr/local/bin /opt/sfw/bin);

my $myname = basename($0);
$myname =~ s/\.\w*$//;   # strip any extension

my %options;
my @getopt_args = (
    'h|?',    # print usage
    'm',      # print manpage
    's',      # skip header
    'u',      # print UUID
    'v',      # print version
    'w',      # print source location
    );

Getopt::Long::config("noignorecase", "bundling");
usage unless GetOptions(\%options, @getopt_args);

manpage if $options{'m'};
myuuid  if $options{'u'};
version if $options{'v'};
where   if $options{'w'};
usage   if $options{'h'};

# Real work starts here.  Slurp the whole file, and skip
# the header if we're looking at a mail message.

undef $/;
$_ = <>;

if ($options{'s'}) {
    my $nl = index($_, "\n\n");

    if ($nl > -1) {
        print substr($_, 0, $nl), "\n\n";
        $_ = substr($_, $nl+2);
    }
}

$_ = untag($_);      # Kill HTML tags...
$_ = spchar($_);     # ...and special chars.
print;
exit(0);

#---------------------------------------------------------------------
# Strip HTML tags.

sub untag {
    local $_ = $_[0] || $_;

# ALGORITHM:
#   find < ,
#       comment <!-- ... -->,
#       or comment <? ... ?> ,
#       or one of the start tags which require correspond
#           end tag plus all to end tag
#       or if \s or ="
#           then skip to next "
#           else [^>]
#   >
    s{
      <               # open tag
      (?:             # open group (A)
        (!--) |       #   comment (1) or
        (\?) |        #   another comment (2) or
        (?i:          #   open group (B) for /i
          ( TITLE  |  #     one of start tags
            SCRIPT |  #     for which
            APPLET |  #     must be skipped
            OBJECT |  #     all content
            STYLE     #     to correspond
          )           #     end tag (3)
        ) |           #   close group (B), or
        ([!/A-Za-z])  #   one of these chars, remember in (4)
      )               # close group (A)
      (?(4)           # if previous case is (4)
        (?:           #   open group (C)
          (?!         #     and next is not : (D)
            [\s=]     #       \s or "="
            ["`']     #       with open quotes
          )           #     close (D)
          [^>] |      #     and not close tag or
          [\s=]       #     \s or "=" with
          `[^`]*` |   #     something in quotes ` or
          [\s=]       #     \s or "=" with
          '[^']*' |   #     something in quotes ' or
          [\s=]       #     \s or "=" with
          "[^"]*"     #     something in quotes "
        )*            #   repeat (C) 0 or more times
      |               # else (if previous case is not (4))
        .*?           #   minimum of any chars
      )               # end if previous char is (4)
      (?(1)           # if comment (1)
        (?<=--)       #   wait for "--"
      )               # end if comment (1)
      (?(2)           # if another comment (2)
        (?<=\?)       #   wait for "?"
      )               # end if another comment (2)
      (?(3)           # if one of tags-containers (3)
        </            #   wait for end
        (?i:\3)       #   of this tag
        (?:\s[^>]*)?  #   skip junk to ">"
      )               # end if (3)
      >               # tag closed
     }{}gsx;          # STRIP THIS TAG

    s/\n\s\s*\n/\n/gs;  # strip extra whitespace.
    return $_ ? $_ : "";
}

#---------------------------------------------------------------------
# Handle special characters.
# http://en.wikipedia.org/wiki/Wikipedia:Village_pump/December_2003_archive_1
# http://ascii.cl/htmlcodes.htm

sub spchar {
    local $_ = $_[0] || $_;

    # smart quotes
    s/[\x93\x94]+/\"/gs;
    s/[\x92\xb2\xb9]+/\'/gs;
    s/[\xb3]+/\`/gs;
    s/[\x96]+/-/gs;

    # HTML escapes
    s/\&#8211;/-/gs;    # &ndash;
    s/\&#8212;/--/gs;
    s/\&#8216;/\`/gs;   # &lsquo;
    s/\&#8217;/\'/gs;   # &rsquo;
    s/\&#8220;/\"/gs;   # &ldquo;
    s/\&#8221;/\"/gs;   # &rdquo;

    # unwanted HTML escapes
    s/\&ndash;/-/gs;
    s/\&lsquo;/\`/gs;
    s/\&rsquo;/\'/gs;
    s/\&ldquo;/\"/gs;
    s/\&rdquo;/\"/gs;

    # nonbreaking spaces --KV
    s/[\xA0]+/ /gs;

    # others --KV
    s/\&quot;/\"/gs;
    s/\&nbsp;/ /gs;
    s/\&bull;/\*/gs;
    s/\&amp;/\&/gs;
    s/\&lt;/</gs;
    s/\&gt;/>/gs;
    s/\&AElig;/AE/gs;
    s/\&Aacute;/A'/gs;
    s/\&Acirc;/A\^/gs;
    s/\&Agrave;/A`/gs;
    s/\&Aring;/AA/gs;
    s/\&Atilde;/A\~/gs;
    s/\&Auml;/A:/gs;
    s/\&Ccedil;/C,/gs;
    s/\&ETH;/D-/gs;
    s/\&Eacute;/E'/gs;
    s/\&Ecirc;/E\^/gs;
    s/\&Egrave;/E`/gs;
    s/\&Euml;/E:/gs;
    s/\&Iacute;/I'/gs;
    s/\&Icirc;/I\^/gs;
    s/\&Igrave;/I`/gs;
    s/\&Iuml;/I:/gs;
    s/\&Ntilde;/N~/gs;
    s/\&Oacute;/O'/gs;
    s/\&Ocirc;/O\^/gs;
    s/\&Ograve;/O`/gs;
    s/\&Oslash;/O\//gs;
    s/\&Otilde;/O\~/gs;
    s/\&Ouml;/O:/gs;
    s/\&THORN;/TH/gs;
    s/\&Uacute;/U'/gs;
    s/\&Ucirc;/U\^/gs;
    s/\&Ugrave;/U`/gs;
    s/\&Uuml;/U:/gs;
    s/\&Yacute;/Y'/gs;
    s/\&aacute;/a'/gs;
    s/\&acirc;/a\^/gs;
    s/\&acute;/'/gs;
    s/\&aelig;/ae/gs;
    s/\&agrave;/a`/gs;
    s/\&aring;/aa/gs;
    s/\&atilde;/a\~/gs;
    s/\&auml;/a:/gs;
    s/\&brvbar;/\|/gs;
    s/\&ccedil;/c,/gs;
    s/\&cedil;/,/gs;
    s/\&cent;/-c-/gs;
    s/\&copy;/(C)/gs;
    s/\&curren;/CUR/gs;
    s/\&deg;/DEG/gs;
    s/\&divide;/-:/gs;
    s/\&eacute;/e'/gs;
    s/\&ecirc;/e\^/gs;
    s/\&egrave;/e`/gs;
    s/\&eth;/d-/gs;
    s/\&euml;/e:/gs;
    s/\&euro;/-e-/gs;
    s/\&frac12;/1\/2/gs;
    s/\&frac14;/1\/4/gs;
    s/\&frac34;/3\/4/gs;
    s/\&iacute;/i'/gs;
    s/\&icirc;/i\^/gs;
    s/\&iexcl;/!/gs;
    s/\&igrave;/i`/gs;
    s/\&iquest;/?/gs;
    s/\&iuml;/i:/gs;
    s/\&laquo;/<</gs;
    s/\&macr;/-/gs;
    s/\&micro;/u/gs;
    s/\&middot;/./gs;
    s/\&not;/NOT/gs;
    s/\&ntilde;/n\~/gs;
    s/\&oacute;/o'/gs;
    s/\&ocirc;/o\^/gs;
    s/\&ograve;/o`/gs;
    s/\&ordf;/-a/gs;
    s/\&ordm;/-o/gs;
    s/\&oslash;/o\//gs;
    s/\&otilde;/o\~/gs;
    s/\&ouml;/o:/gs;
    s/\&para;/P:/gs;
    s/\&plusmn;/+-/gs;
    s/\&pound;/-L-/gs;
    s/\&raquo;/>>/gs;
    s/\&reg;/(R)/gs;
    s/\&sect;/S:/gs;
    s/\&shy;/-/gs;
    s/\&sup1;/\^1/gs;
    s/\&sup2;/\^2/gs;
    s/\&sup3;/\^3/gs;
    s/\&szlig;/ss/gs;
    s/\&thorn;/th/gs;
    s/\&times;/x/gs;
    s/\&uacute;/u'/gs;
    s/\&ucirc;/u\^/gs;
    s/\&ugrave;/u`/gs;
    s/\&uml;/"/gs;
    s/\&uuml;/u:/gs;
    s/\&yacute;/y'/gs;
    s/\&yen;/=Y=/gs;
    s/\&yuml;/y:/gs;

    return $_ ? $_ : "";
}

#---------------------------------------------------------------------
# Print a usage message from the comments and exit.

sub usage {
    my ($emsg) = @_;
    use Pod::Usage qw(pod2usage);
    warn "$emsg\n" if defined $emsg;
    pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS");
}

sub manpage {
    my @args = ("perldoc", "$0");
    exec { $args[0] } @args;          # safe even with one-arg list
    die("should not get here\n");
}

#---------------------------------------------------------------------
# Print the UUID, current version, or source location.

sub myuuid {
    my $UUID = $1
        if q$UUID: 4de9fae6-7140-3212-bf0c-b94fe0096002 $ =~ /UUID: (.*) /;
    print "$UUID\n";
    exit(0);
}

sub version {
    my $VERSION = sprintf("%d.%02d", q$Revision: 1.5 $ =~ /(\d+)\.(\d+)/);
    my $DATE = $1 if q$Date: 2012-07-26 17:11:48-04 $ =~ /Date: (.*) /;
    print "$myname $VERSION $DATE\n";
    exit(0);
}

sub where {
    my $SOURCE = $1
        if q$Source: /home/vogelke/bin/RCS/notags,v $ =~
          /Source: (.*) /;
    my $HOST = $1 if q$Host: sys7.com $ =~ /Host: (.*) /;
    print "file://$HOST", "$SOURCE\n";
    exit(0);
}

#---------------------------------------------------------------------
__END__

=head1 NAME

notags - strip HTML tags and replace character entities with ASCII

=head1 SYNOPSIS

notags [-hmsuvw] [file ...]

=head1 OPTIONS

=over 4

=item B<-h>

Print a brief help message and exit.

=item B<-m>

Print the manual page and exit.

=item B<-s>

We're handling a mail message, so skip the header.

=item B<-u>

Print the script UUID and exit.

=item B<-v>

Print the version and exit.

=item B<-w>

Print the source location and exit.

=back

=head1 DESCRIPTION

B<notags> strips anything resembling HTML from the input, and replaces
character entities with ASCII equivalents.

=head1 AUTHOR

 Karl Vogel <vogelke@pobox.com>
 Oasis Systems, Inc.

=cut
