#!/usr/bin/perl -w #; if ($options{'s'}) { my $nl = index($_, "\n\n"); if ($nl > -1) { print substr($_, 0, $nl), "\n\n"; $_ = substr($_, $nl+2); } } $_ = untag($_); # Kill HTML tags... $_ = spchar($_); # ...and special chars. print; exit(0); #--------------------------------------------------------------------- # Strip HTML tags. sub untag { local $_ = $_[0] || $_; # ALGORITHM: # find < , # comment , # or comment , # or one of the start tags which require correspond # end tag plus all to end tag # or if \s or =" # then skip to next " # else [^>] # > s{ < # open tag (?: # open group (A) (!--) | # comment (1) or (\?) | # another comment (2) or (?i: # open group (B) for /i ( TITLE | # one of start tags SCRIPT | # for which APPLET | # must be skipped OBJECT | # all content STYLE # to correspond ) # end tag (3) ) | # close group (B), or ([!/A-Za-z]) # one of these chars, remember in (4) ) # close group (A) (?(4) # if previous case is (4) (?: # open group (C) (?! # and next is not : (D) [\s=] # \s or "=" ["`'] # with open quotes ) # close (D) [^>] | # and not close tag or [\s=] # \s or "=" with `[^`]*` | # something in quotes ` or [\s=] # \s or "=" with '[^']*' | # something in quotes ' or [\s=] # \s or "=" with "[^"]*" # something in quotes " )* # repeat (C) 0 or more times | # else (if previous case is not (4)) .*? # minimum of any chars ) # end if previous char is (4) (?(1) # if comment (1) (?<=--) # wait for "--" ) # end if comment (1) (?(2) # if another comment (2) (?<=\?) # wait for "?" ) # end if another comment (2) (?(3) # if one of tags-containers (3) ]*)? # skip junk to ">" ) # end if (3) > # tag closed }{}gsx; # STRIP THIS TAG s/\n\s\s*\n/\n/gs; # strip extra whitespace. return $_ ? $_ : ""; } #--------------------------------------------------------------------- # Handle special characters. # http://en.wikipedia.org/wiki/Wikipedia:Village_pump/December_2003_archive_1 # http://ascii.cl/htmlcodes.htm sub spchar { local $_ = $_[0] || $_; # smart quotes s/[\x93\x94]+/\"/gs; s/[\x92\xb2\xb9]+/\'/gs; s/[\xb3]+/\`/gs; s/[\x96]+/-/gs; # HTML escapes s/\–/-/gs; # – s/\—/--/gs; s/\‘/\`/gs; # ‘ s/\’/\'/gs; # ’ s/\“/\"/gs; # “ s/\”/\"/gs; # ” # unwanted HTML escapes s/\–/-/gs; s/\‘/\`/gs; s/\’/\'/gs; s/\“/\"/gs; s/\”/\"/gs; # nonbreaking spaces --KV s/[\xA0]+/ /gs; # others --KV s/\"/\"/gs; s/\ / /gs; s/\•/\*/gs; s/\&/\&/gs; s/\<//gs; s/\Æ/AE/gs; s/\Á/A'/gs; s/\Â/A\^/gs; s/\À/A`/gs; s/\Å/AA/gs; s/\Ã/A\~/gs; s/\Ä/A:/gs; s/\Ç/C,/gs; s/\Ð/D-/gs; s/\É/E'/gs; s/\Ê/E\^/gs; s/\È/E`/gs; s/\Ë/E:/gs; s/\Í/I'/gs; s/\Î/I\^/gs; s/\Ì/I`/gs; s/\Ï/I:/gs; s/\Ñ/N~/gs; s/\Ó/O'/gs; s/\Ô/O\^/gs; s/\Ò/O`/gs; s/\Ø/O\//gs; s/\Õ/O\~/gs; s/\Ö/O:/gs; s/\Þ/TH/gs; s/\Ú/U'/gs; s/\Û/U\^/gs; s/\Ù/U`/gs; s/\Ü/U:/gs; s/\Ý/Y'/gs; s/\á/a'/gs; s/\â/a\^/gs; s/\´/'/gs; s/\æ/ae/gs; s/\à/a`/gs; s/\å/aa/gs; s/\ã/a\~/gs; s/\ä/a:/gs; s/\¦/\|/gs; s/\ç/c,/gs; s/\¸/,/gs; s/\¢/-c-/gs; s/\©/(C)/gs; s/\¤/CUR/gs; s/\°/DEG/gs; s/\÷/-:/gs; s/\é/e'/gs; s/\ê/e\^/gs; s/\è/e`/gs; s/\ð/d-/gs; s/\ë/e:/gs; s/\€/-e-/gs; s/\½/1\/2/gs; s/\¼/1\/4/gs; s/\¾/3\/4/gs; s/\í/i'/gs; s/\î/i\^/gs; s/\¡/!/gs; s/\ì/i`/gs; s/\¿/?/gs; s/\ï/i:/gs; s/\«/<>/gs; s/\®/(R)/gs; s/\§/S:/gs; s/\­/-/gs; s/\¹/\^1/gs; s/\²/\^2/gs; s/\³/\^3/gs; s/\ß/ss/gs; s/\þ/th/gs; s/\×/x/gs; s/\ú/u'/gs; s/\û/u\^/gs; s/\ù/u`/gs; s/\¨/"/gs; s/\ü/u:/gs; s/\ý/y'/gs; s/\¥/=Y=/gs; s/\ÿ/y:/gs; return $_ ? $_ : ""; } #--------------------------------------------------------------------- # Print a usage message from the comments and exit. sub usage { my ($emsg) = @_; use Pod::Usage qw(pod2usage); warn "$emsg\n" if defined $emsg; pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS"); } sub manpage { use Pod::Man(); my $parser = Pod::Man->new(); open(STDOUT, "| groff -T ascii -man | gcat -s | less") || die "groff\n"; $parser->parse_from_file($0); close STDOUT || die "$myname: can't close stdout: $!\n"; $? = 1 if $? == 255; # from die exit($?); } #--------------------------------------------------------------------- # Print the UUID, current version, or source location. sub myuuid { my $UUID = $1 if q$UUID: 4de9fae6-7140-3212-bf0c-b94fe0096002 $ =~ /UUID: (.*) /; print "$UUID\n"; exit(0); } sub version { my $VERSION = sprintf("%d.%02d", q$Revision: 1.3 $ =~ /(\d+)\.(\d+)/); my $DATE = $1 if q$Date: 2009/11/29 02:31:06 $ =~ /Date: (.*) /; print "$myname $VERSION $DATE\n"; exit(0); } sub where { my $SOURCE = $1 if q$Source: /home/vogelke/bin/RCS/notags,v $ =~ /Source: (.*) /; my $HOST = $1 if q$Host: example.com $ =~ /Host: (.*) /; print "file://$HOST", "$SOURCE\n"; exit(0); } #--------------------------------------------------------------------- __END__ =head1 NAME notags - strip HTML tags and replace character entities with ASCII =head1 SYNOPSIS notags [-hmsuvw] [file ...] =head1 OPTIONS =over 4 =item B<-h> Print a brief help message and exit. =item B<-m> Print the manual page and exit. =item B<-s> We're handling a mail message, so skip the header. =item B<-u> Print the script UUID and exit. =item B<-v> Print the version and exit. =item B<-w> Print the source location and exit. =back =head1 DESCRIPTION B strips anything resembling HTML from the input, and replaces character entities with ASCII equivalents. =head1 AUTHOR Karl Vogel Oasis Systems, Inc. =cut