#!/usr/bin/perl
#
# $Id: fetchurl,v 1.4 1999/10/30 00:13:19 vogelke Exp $
#
# NAME:
#    fetchurl
#
# SYNOPSIS:
#    fetchurl [-dhlpsv] file
#
# DESCRIPTION:
#    "fetchurl" reads a set of URLs from a file or stdin, and uses
#    lynx to store a nice text copy of each one in its own file.
#
#    The original URL is written as the first line.  The URL basename
#    is used as the filename.  We append an integer if the basename
#    is not unique.
#
# OPTIONS:
#    "-d" turns debugging on; show what URLs will be downloaded and where.
#    "-h" just shows the URLs; intended as a filter.
#    "-l" show URLs as links; quick and dirty way to take a list of URLs
#         and make a short page for viewing them.
#    "-p" prints results without prepending URL.
#    "-s" prints results to stdout.
#    "-v" prints the current version and exits.
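#
# EXAMPLES:
#    A sketch of typical use; the file "urls" and the URL shown here
#    are placeholders, not output from a real run:
#
#        % fetchurl -d urls
#        http://www.example.com/reports/q3.html --> (q3.txt)
#
#        % fetchurl -h urls
#        http://www.example.com/reports/q3.html
#
#        % fetchurl urls
#        q3.txt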
#
# NOTES:
#    Sometimes I need to fetch a bunch of docs from a site that doesn't
#    allow automatic downloading.  To handle this:
#
#    1. Get a list of the URLs using
#           grep ... | fetchurl -h | fetchurl -l > /some/temp/file.htm
#
#    2. Fire up each one in turn, and save them as text in the same
#       directory (say, "/home/directory").
#
#    3. When we're done, do something like this:
#           % cd /work/directory
#           % fetchurl -h /some/temp/file.htm > 1
#           % count=`wc -l < /some/temp/file.htm`
#           % ls -rt /home/directory | tail -$count > 2
#           % paste 1 2 > 3
#           % ./doit < 3
#
#       where the "doit" script looks like this:
#
#           #!/usr/bin/perl
#           $home = "/home/directory";
#
#           while (<>)
#           {
#               chomp;
#               ($url, $file) = split (/\t/);
#               print "$url $home/$file\n";
#
#               open (IN, "$home/$file") || die;
#               open (OUT, "> $file") || die;
#               print OUT "$url\n\n";
#               print OUT while <IN>;
#               close (IN);
#               close (OUT);
#           }
#
#       This will prepend the original URL to each file read.  This assumes
#       you read each file in order from the temporary list in file.htm.
#
# AUTHOR:
#    Karl E. Vogel
#    Sumaria Systems, Inc.

eval 'exec perl -S $0 ${1+"$@"}'    # If the shell can't handle "#!",
    if 0;                           # fire up perl directly.

require "getopts.pl";               # command line args.
$ENV{"PATH"} = "/bin:/usr/sbin:/usr/local/bin";

($myname) = split (/\//, reverse ($0));
$myname = reverse ($myname);        # script basename.

#
# Trap most common signals.  Handle command line arguments (if any).
#
$SIG{'HUP'}  = 'sigcatcher';
$SIG{'INT'}  = 'sigcatcher';
$SIG{'QUIT'} = 'sigcatcher';
$SIG{'TERM'} = 'sigcatcher';

&usage unless &Getopts ('dhlpsv');
$debug  = 1 if $opt_d;
$filter = 1 if $opt_h;
$links  = 1 if $opt_l;
$plain  = 1 if $opt_p;
&version if $opt_v;

#
# If listing links, start an HTML page.
#
if ($links)
{
    print "<html><head><title> Links </title></head>\n";
    print "<body>\n";
    print "<ol>\n";
}

#
# Handle a file on the command line or stdin.
#
while (<>)
{
    chomp;
    next unless /(ftp:|http:)\/\//;

    #
    # If we're getting binary data, set a flag so we don't
    # try prepending the URL.
    #
    $binary = 0;
    $binary = 1 if /\.pdf|\.tgz|\.gz/i;

    #
    # Get the URL.  Allow the user to include "raw" HTML lines
    # containing an HREF= field, if desired.
    #
    s/HREF/href/g;
    $save = $_;

    if    (/href=\"([^\"]*)\"/)  { $save = $1; }
    elsif (/(http:\/\/[^\s]*)/)  { $save = $1; }
    elsif (/(ftp:\/\/[^\s]*)/)   { $save = $1; }

    $_ = $save;

    #
    # Use the URL to make the base part of the filename.
    # Change odd characters into slashes, so the basename part will
    # behave predictably.  Try to pick a sensible name.
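    #
    # For example, a made-up URL like
    #     http://www.example.com/reports/q3.html?id=7
    # comes out of the rules below as "q3.txt"; the "?id=7" part turns
    # into extra path pieces that are too short to use.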
    #
    $url = $_;

    s/\?OpenDocument$//g;
    s/\?st\.ne\..*$//;
    s/\&col=.*//;
    s/\&//g;
    s/maxfieldsize=.*//;
    s/_noframes//;
    tr,!?%&=#,/,;
    tr,$,,d;

    #
    # Work backwards through the path components (reverse the URL,
    # split on "/", then un-reverse each piece) until we find one
    # long enough to make a reasonable filename.
    #
    @a = split (/\//, reverse ($_));

    foreach $x (@a)
    {
        $_ = reverse ($x);

        s/.*\.cgi\?//g;
        s/(.*\.html)\?.*/$1/g;
        s/\.html$/.txt/;
        s/\.[sp]html$/.txt/;
        s/\.htm$/.txt/;
        s/\.asp$/.txt/;

        $file = $_;
        last if length ($file) > 3;
    }

    #
    # Listing, not fetching.
    #
    if ($debug)
    {
        print "$url --> ($file)\n";
        next;
    }

    if ($links)
    {
        print "<li> $url\n";
        print "<li> <a href=\"$url\">$file</a>\n";
        next;
    }

    if ($filter)
    {
        print "$url\n";
        next;
    }

    #
    # If we have a duplicate filename, append an integer.
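    # (With the example above, a second q3.txt would become q3.txt.1,
    # a third q3.txt.2, and so on.)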
    #
    if (-f $file)
    {
        $k = 1;
        $newfile = $file;

        while (-f $newfile)
        {
            $newfile = $file . ".$k";
            $k++;
        }

        $file = $newfile;
    }

    #
    # Read from lynx, write to either stdout or the generated file.
    #
    open (IN, "lynx -width=85 -dump '$url' -auth=vogelke:clueless |") ||
        die "lynx: $!\n";

    if ($opt_s)     # stdout
    {
        open (OUT, ">&STDOUT");
    }
    else
    {
        open (OUT, "> $file") || die "$file: $!\n";
        print OUT "$url\n\n" unless $binary || $plain;
        print "$file\n";
    }
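    #
    # A saved file then looks roughly like this (using the example URL
    # from above):
    #
    #     http://www.example.com/reports/q3.html?id=7
    #
    #     ...text of the page as rendered by lynx...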
    while (<IN>)
    {
        chomp;
        s/^\s*$//g;     # squash whitespace-only lines.
        print OUT "$_\n";
    }

    close (IN);
    close (OUT);
}

#
# If printing links, clean up here.
#
if ($links)
{
    print "</ol></body></html>\n";
}

&exit (0);

#---------------------------------------------------------------------
# Print a short usage message from the comment header and exit.
#
sub usage
{
    if (open (PROG, "$0"))
    {
        while (<PROG>)
        {
            last if /^# NAME:/;
        }

        print STDERR " NAME:\n";

        while (<PROG>)
        {
            last if /^\s*$/;
            last if /^# AUTHOR:/;
            s/^#//;
            print STDERR;
        }

        close (PROG);
    }
    else
    {
        print STDERR "No usage information available.\n";
    }

    &exit (1);
}

#---------------------------------------------------------------------
# Do something if we get a signal.
#
sub sigcatcher
{
    local ($sig) = @_;

    &exit (2, "caught signal SIG$sig -- shutting down.\n");
}

#---------------------------------------------------------------------
# Print the current version and exit.
#
sub version
{
    $_ = '$RCSfile: fetchurl,v $ $Revision: 1.4 $ ' .
         '$Date: 1999/10/30 00:13:19 $';

    s/RCSfile: //;
    s/.Date: //;
    s/,v . .Revision: / v/;
    s/\$//g;

    print "$_\n";
    exit (0);
}

#---------------------------------------------------------------------
# Clean up.
#
sub exit
{
    local ($code, $msg) = @_;

    unlink ($tmp);
    warn "$myname: $msg\n" if $msg;
    exit ($code);
}