#!/usr/bin/perl
#
# $Id: fetchurl,v 1.4 1999/10/30 00:13:19 vogelke Exp $
#
# NAME:
#    fetchurl
#
# SYNOPSIS:
#    fetchurl [-dhlpsv] file
#
# DESCRIPTION:
#    "fetchurl" reads a set of URLs from a file or stdin, and uses
#    lynx to store a nice text copy of each one in its own file.
#
#    The original URL is written as the first line.  The URL basename
#    is used as the filename.  We append an integer if the basename
#    is not unique.
#
# OPTIONS:
#    "-d" turns debugging on; show what URLs will be downloaded and where.
#    "-h" just shows the URLs; intended as a filter.
#    "-l" show URLs as links; quick and dirty way to take a list of URLs
#         and make a short page for viewing them.
#    "-p" prints results without prepending URL.
#    "-s" prints results to stdout.
#    "-v" prints the current version and exits.
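#
# EXAMPLES:
#    A sketch of typical use; the file "urls" and the URL shown here
#    are placeholders, not output from a real run:
#
#        % fetchurl -d urls
#        http://www.example.com/reports/q3.html --> (q3.txt)
#
#        % fetchurl -h urls
#        http://www.example.com/reports/q3.html
#
#        % fetchurl urls
#        q3.txt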
#
# NOTES:
#    Sometimes I need to fetch a bunch of docs from a site that doesn't
#    allow automatic downloading.  To handle this:
#
#    1. Get a list of the URLs using
#           grep ... | fetchurl -h | fetchurl -l > /some/temp/file.htm
#
#    2. Fire up each one in turn, and save them as text in the same
#       directory (say, "/home/directory").
#
#    3. When we're done, do something like this:
#           % cd /work/directory
#           % fetchurl -h /some/temp/file.htm > 1
#           % count=`wc -l < /some/temp/file.htm`
#           % ls -rt /home/directory | tail -$count > 2
#           % paste 1 2 > 3
#           % ./doit < 3
#
#       where the "doit" script looks like this:
#
#           #!/usr/bin/perl
#           $home = "/home/directory";
#
#           while (<>)
#           {
#               chomp;
#               ($url, $file) = split (/\t/);
#               print "$url $home/$file\n";
#
#               open (IN, "$home/$file") || die;
#               open (OUT, "> $file") || die;
#               print OUT "$url\n\n";
#               print OUT while <IN>;
#               close (IN);
#               close (OUT);
#           }
#
#       This will prepend the original URL to each file read.  This assumes
#       you read each file in order from the temporary list in file.htm.
#
# AUTHOR:
#    Karl E. Vogel
#    Sumaria Systems, Inc.

eval 'exec perl -S $0 ${1+"$@"}'    # If the shell can't handle "#!",
    if 0;                           # fire up perl directly.

require "getopts.pl";               # command line args.
$ENV{"PATH"} = "/bin:/usr/sbin:/usr/local/bin";

($myname) = split (/\//, reverse ($0));
$myname = reverse ($myname);        # script basename.

#
# Trap most common signals.  Handle command line arguments (if any).
#
$SIG{'HUP'}  = 'sigcatcher';
$SIG{'INT'}  = 'sigcatcher';
$SIG{'QUIT'} = 'sigcatcher';
$SIG{'TERM'} = 'sigcatcher';

&usage unless &Getopts ('dhlpsv');
$debug  = 1 if $opt_d;
$filter = 1 if $opt_h;
$links  = 1 if $opt_l;
$plain  = 1 if $opt_p;
&version if $opt_v;

#
# If listing links, start an HTML page.
#
if ($links)
{
    print "<html><head><title> Links </title></head>\n";
    print "<body>\n";
    print "<ol>\n";
}

#
# Handle a file on the command line or stdin.
#
while (<>)
{
    chomp;
    next unless /(ftp:|http:)\/\//;

    #
    # If we're getting binary data, set a flag so we don't
    # try prepending the URL.
    #
    $binary = 0;
    $binary = 1 if /\.pdf|\.tgz|\.gz/i;

    #
    # Get the URL.  Allow the user to include "raw" HTML lines
    # containing an HREF= field, if desired.
    #
    s/HREF/href/g;
    $save = $_;

    if    (/href=\"([^\"]*)\"/)  { $save = $1; }
    elsif (/(http:\/\/[^\s]*)/)  { $save = $1; }
    elsif (/(ftp:\/\/[^\s]*)/)   { $save = $1; }

    $_ = $save;

    #
    # Use the URL to make the base part of the filename.
    # Change odd characters into slashes, so the basename part will
    # behave predictably.  Try to pick a sensible name.
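    #
    # For example, a made-up URL like
    #     http://www.example.com/reports/q3.html?id=7
    # comes out of the rules below as "q3.txt"; the "?id=7" part turns
    # into extra path pieces that are too short to use.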
    #
    $url = $_;

    s/\?OpenDocument$//g;
    s/\?st\.ne\..*$//;
    s/\&col=.*//;
    s/\&//g;
    s/maxfieldsize=.*//;
    s/_noframes//;
    tr,!?%&=#,/,;
    tr,$,,d;

    #
    # Work backwards through the path components (reverse the URL,
    # split on "/", then un-reverse each piece) until we find one
    # long enough to make a reasonable filename.
    #
    @a = split (/\//, reverse ($_));

    foreach $x (@a)
    {
        $_ = reverse ($x);

        s/.*\.cgi\?//g;
        s/(.*\.html)\?.*/$1/g;
        s/\.html$/.txt/;
        s/\.[sp]html$/.txt/;
        s/\.htm$/.txt/;
        s/\.asp$/.txt/;

        $file = $_;
        last if length ($file) > 3;
    }

    #
    # Listing, not fetching.
    #
    if ($debug)
    {
        print "$url --> ($file)\n";
        next;
    }

    if ($links)
    {
        print "<li> $url\n";
        print "<li> <a href=\"$url\">$file</a>\n";
        next;
    }

    if ($filter)
    {
        print "$url\n";
        next;
    }

    #
    # If we have a duplicate filename, append an integer.
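    # (With the example above, a second q3.txt would become q3.txt.1,
    # a third q3.txt.2, and so on.)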
    #
    if (-f $file)
    {
        $k = 1;
        $newfile = $file;

        while (-f $newfile)
        {
            $newfile = $file . ".$k";
            $k++;
        }

        $file = $newfile;
    }

    #
    # Read from lynx, write to either stdout or the generated file.
    #
    open (IN, "lynx -width=85 -dump '$url' -auth=vogelke:clueless |") ||
        die "lynx: $!\n";

    if ($opt_s)     # stdout
    {
        open (OUT, ">&STDOUT");
    }
    else
    {
        open (OUT, "> $file") || die "$file: $!\n";
        print OUT "$url\n\n" unless $binary || $plain;
        print "$file\n";
    }
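    #
    # A saved file then looks roughly like this (using the example URL
    # from above):
    #
    #     http://www.example.com/reports/q3.html?id=7
    #
    #     ...text of the page as rendered by lynx...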
    while (<IN>)
    {
        chomp;
        s/^\s*$//g;     # squash whitespace-only lines.
        print OUT "$_\n";
    }

    close (IN);
    close (OUT);
}

#
# If printing links, clean up here.
#
if ($links)
{
    print "</ol></body></html>\n";
}

&exit (0);

#---------------------------------------------------------------------
# Print a short usage message from the comment header and exit.
#
sub usage
{
    if (open (PROG, "$0"))
    {
        while (<PROG>)
        {
            last if /^# NAME:/;
        }

        print STDERR " NAME:\n";

        while (<PROG>)
        {
            last if /^\s*$/;
            last if /^# AUTHOR:/;
            s/^#//;
            print STDERR;
        }

        close (PROG);
    }
    else
    {
        print STDERR "No usage information available.\n";
    }

    &exit (1);
}

#---------------------------------------------------------------------
# Do something if we get a signal.
#
sub sigcatcher
{
    local ($sig) = @_;

    &exit (2, "caught signal SIG$sig -- shutting down.\n");
}

#---------------------------------------------------------------------
# Print the current version and exit.
#
sub version
{
    $_ = '$RCSfile: fetchurl,v $ $Revision: 1.4 $ ' .
         '$Date: 1999/10/30 00:13:19 $';

    s/RCSfile: //;
    s/.Date: //;
    s/,v . .Revision: / v/;
    s/\$//g;

    print "$_\n";
    exit (0);
}

#---------------------------------------------------------------------
# Clean up.
#
sub exit
{
    local ($code, $msg) = @_;

    unlink ($tmp);
    warn "$myname: $msg\n" if $msg;
    exit ($code);
}