#!/usr/bin/perl -w
#< file2hash: print MD4 signature for files or stdin

use Getopt::Long;
use File::Basename;
use Digest::MD4;
use strict;

# Replace the inherited PATH with a fixed list of directories; this is
# where "perldoc" is looked up when -m is given.
$ENV{'PATH'} = join ":", qw(/bin /usr/bin /usr/local/bin /opt/sfw/bin);

# Program name without directory or extension, shown by -v.
my $myname = basename($0);
$myname =~ s/\.\w*$//;   # strip any extension

my %options;
my @getopt_args = (
    'h|?',    # print usage
    'm',      # print manpage
    'u',      # print UUID
    'v',      # print version
    'w',      # print source location
    );

# Option letters are case-sensitive and may be bundled (e.g. -vw).
Getopt::Long::Configure("no_ignore_case", "bundling");
usage() unless GetOptions(\%options, @getopt_args);

# Every handler below ends the program itself, so at most the first
# matching one runs.
manpage() if $options{'m'};
myuuid()  if $options{'u'};
version() if $options{'v'};
where()   if $options{'w'};
usage()   if $options{'h'};

# Display the signature for any requested files; with no file arguments
# left after option parsing, read stdin ("-").

push(@ARGV, '-') unless @ARGV;
for my $file (@ARGV) {
    # makesig() returns nothing for files it could not or would not sign.
    my $md = makesig($file);
    print "$md $file\n" if $md;
}

exit(0);

# --------------------------------------------------------------------
# saferead($file): opens a file (or stdin) for reading.
#
#   $file    path to open, or '-' to duplicate the standard input.
#
# Returns a read filehandle on success.  On failure a warning naming the
# calling context and the system error is issued and undef is returned.

sub saferead {
    my $sname = (caller(0))[3];
    my $file  = shift;
    my ($fh, $ok, $what);

    # Three-argument open in both branches: the mode is never taken from
    # the file name, so odd characters in $file are always literal.
    if ($file eq '-') {
        $what = "can't dup stdin";
        $ok   = open($fh, '<&', \*STDIN);
    }
    else {
        $what = "can't read $file";
        $ok   = open($fh, '<', $file);
    }
    return $fh if $ok;

    warn "$sname: $what: $!\n";

    # A failed open can still leave a glob in $fh, so hand back an
    # explicit undef rather than the variable.
    return undef;
}

# --------------------------------------------------------------------
# makesig($file): computes $file digest, appends file-size.
#
#   $file    path of the file to sign, or '-' for stdin.
#
# Returns the hex MD4 digest of the contents immediately followed by the
# size reported by stat(), zero-padded to at least 8 hex digits.
# Returns nothing when the file cannot be opened, is neither a regular
# file nor a pipe, or cannot be read.  Read failures are reported with
# warn(); unsupported file types are skipped silently.

sub makesig {
    my $sname = (caller(0))[3];
    my $file  = shift;

    # saferead() has already warned if the open failed.
    my $fh = saferead($file);
    return unless $fh;

    # Only regular files and pipes are signed.
    unless (-f $fh || -p $fh) {
        close($fh);
        return;
    }
    my $size = (stat($fh))[7];

    # Hash the raw bytes: keep newline translation or any default I/O
    # layer from altering what addfile() reads.
    binmode($fh);

    my $digest = Digest::MD4->new;
    unless (eval { $digest->addfile($fh); 1 }) {
        # Capture both error sources before anything can overwrite them:
        # $@ is the exception itself, $! may hold the underlying reason.
        my ($why, $errno) = ($@, "$!");
        $why =~ s/\s+\z//;
        $why = 'unknown error' unless length $why;
        $why .= " ($errno)" if length $errno;
        warn "$sname: sum $file: $why\n";
        close($fh);
        return;
    }

    close($fh);
    return $digest->hexdigest . sprintf("%8.8x", $size);
}

#---------------------------------------------------------------------
# usage([$emsg]): warn with $emsg when one is supplied, then hand over
# to pod2usage() to show the NAME, SYNOPSIS and OPTIONS sections of this
# script's POD.

sub usage {
    use Pod::Usage qw(pod2usage);

    my $emsg = shift;
    if (defined $emsg) {
        warn "$emsg\n";
    }

    # Verbosity 99 tells pod2usage to print only the listed sections.
    pod2usage(
        -verbose  => 99,
        -sections => "NAME|SYNOPSIS|OPTIONS",
    );
}

# manpage(): replace this process with "perldoc" displaying this script's
# own documentation.  Never returns; dies with the system error if
# perldoc cannot be started.

sub manpage {
    my @args = ("perldoc", $0);

    # The indirect-object form always treats @args as an argument list,
    # even if it held a single element.
    exec { $args[0] } @args;

    # exec only returns when it failed, and $! says why.
    die("can't exec $args[0]: $!\n");
}

#---------------------------------------------------------------------
# Print the UUID, current version, or source location.

# myuuid(): print the UUID embedded in the keyword string below, then
# exit successfully.

sub myuuid {
    # A list-context match hands back the captured identifier.
    my ($uuid) =
      q$UUID: 5ba9072d-f664-378e-9666-2fa6d1544870 $ =~ /UUID: (.*) /;
    print $uuid, "\n";
    exit(0);
}

# version(): print the program name, the revision as MAJOR.MINOR (minor
# zero-padded to two digits) and the date from the keyword strings below,
# then exit successfully.

sub version {
    my ($major, $minor) = q$Revision: 1.5 $ =~ /(\d+)\.(\d+)/;
    my ($stamp) = q$Date: 2010-11-02 17:46:14-04 $ =~ /Date: (.*) /;
    printf "%s %d.%02d %s\n", $myname, $major, $minor, $stamp;
    exit(0);
}

# where(): print the source path embedded in the keyword string below,
# then exit successfully.

sub where {
    # A list-context match hands back the captured path.
    my ($source) =
      q$Source: /home/vogelke/notebook/2008/0327/RCS/file2hash,v $ =~ /Source: (.*) /;
    print $source, "\n";
    exit(0);
}

#---------------------------------------------------------------------
__END__

=head1 NAME

file2hash - print a unique signature for files

=head1 SYNOPSIS

file2hash [-hmuvw] [file ...]

=head1 OPTIONS

=over 4

=item B<-h>

Print a brief help message and exit.

=item B<-m>

Print the manual page and exit.

=item B<-u>

Print the script UUID and exit.

=item B<-v>

Print the version and exit.

=item B<-w>

Print the source location and exit.

=back

=head1 DESCRIPTION

B<file2hash> will read stdin or the given input file(s).  Output goes to
stdout, and consists of a unique signature followed by the filename (or
"-" for stdin).  The signature is 40 hex characters: the 32-character MD4
hash of the contents followed by the file size as 8 hex digits.

Only regular files and pipes are processed; any other type of file is
skipped without a message.

=head1 NOTES

Always use the three-argument version of open(); our filenames have
all sorts of weird characters in them which need to be protected.

We have several million files that are stored according to their hash
value, so a fast signature method was needed.  Here are some benchmarks;
first time is wall-clock seconds.

 Benchmark: timing 20 iterations of haval, md4, sha256, sha512...
     haval: 3 secs ( 2.92 usr + 0.14 sys = 3.06 CPU) @ 6.54/s (n=20)
       md4: 1 secs ( 0.43 usr + 0.10 sys = 0.53 CPU) @ 37.74/s (n=20)
    sha256: 2 secs ( 2.74 usr + 0.11 sys = 2.85 CPU) @ 7.02/s (n=20)
    sha512: 6 secs ( 5.97 usr + 0.12 sys = 6.09 CPU) @ 3.28/s (n=20)

=head1 AUTHOR

 Karl Vogel <vogelke@pobox.com>
 Sumaria Systems, Inc.

=cut
