#!/usr/bin/perl -w
#<linkdups: hard-link identical files to save diskspace

use Getopt::Long;
use File::Basename;
use strict;
use subs qw(manpage myuuid usage version where);

my $myname = basename($0);
$myname =~ s/\.\w*$//;   # strip any extension

my %options;
my @getopt_args = (
    'd',      # debug - only show what would be done
    'h|?',    # print usage
    'm',      # print manpage
    'q',      # don't print names being linked
    'u',      # print UUID
    'v',      # print version
    'w',      # print source location
    );

Getopt::Long::config("noignorecase", "bundling");
usage unless GetOptions(\%options, @getopt_args);

manpage if $options{'m'};
myuuid  if $options{'u'};
version if $options{'v'};
where   if $options{'w'};
usage   if $options{'h'};

#
# Read hashfile, create links for regular files.
#

my $ohash = '';
my $opath = '';
my $nhash;
my $npath;
my $sep;

while (<>) {
    chomp;

    # The format should be "HASH-VALUE <spaces> FILENAME".
    # The first space is the field separator.

    $sep = index($_, ' ') + 1;
    die "no separator: line $.\n" if $sep <= 0;

    $nhash = substr($_, 0, $sep);
    $npath = substr($_, $sep);
    $npath =~ s/^\s*//;

    if ($ohash eq $nhash) {
        print "[$opath] --> [$npath]\n" unless $options{'q'};

        unless ($options{'d'}) {
            unlink($npath)
              || warn("$npath unlink failed\n");
            link($opath, $npath)
              || warn("link ($opath, $npath) failed\n");
        }
    }

    $ohash = $nhash;
    $opath = $npath;
}

exit(0);

#---------------------------------------------------------------------
# Print a usage message from the comments and exit.

sub usage {
    my ($emsg) = @_;
    use Pod::Usage qw(pod2usage);
    warn "$emsg\n" if defined $emsg;
    pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS");
}

sub manpage {
    my @args = ("perldoc", "$0");
    exec { $args[0] } @args;          # safe even with one-arg list
    die("should not get here\n");
}

#---------------------------------------------------------------------
# Print the UUID, current version, or source location.

sub myuuid {
    my $UUID = sprintf("%s",
        q$UUID: b79d686c-8162-3058-b027-1a399279664c $ =~ /UUID: (.*) /);
    print "$UUID\n";
    exit(0);
}

sub version {
    my $VERSION = sprintf("%d.%02d", q$Revision: 1.5 $ =~ /(\d+)\.(\d+)/);
    my $DATE =
      sprintf("%s", q$Date: 2010-11-02 17:50:12-04 $ =~ /Date: (.*) /);
    print "$myname $VERSION $DATE\n";
    exit(0);
}

sub where {
    my $SOURCE = sprintf("%s",
        q$Source: /home/vogelke/bin/RCS/linkdups,v $ =~
          /Source: (.*) /);
    print "$SOURCE\n";
    exit(0);
}

#---------------------------------------------------------------------
__END__

=head1 NAME

linkdups - hard-link identical files to save diskspace

=head1 SYNOPSIS

linkdups [-dhmquvw] [hashfile]

=head1 OPTIONS

=over 4

=item B<-d>

Debug: prints what would be done without doing it.

=item B<-h>

Print a brief help message and exit.

=item B<-m>

Print the manual page and exit.

=item B<-q>

Don't print filenames being linked.

=item B<-u>

Print the script UUID and exit.

=item B<-v>

Print the version and exit.

=item B<-w>

Print the source location and exit.

=back

=head1 DESCRIPTION

B<linkdups> reads sorted hashes (MD5, RMD160, etc.) for regular files
in the same filesystem, looks for duplicate entries, and makes hard-links
to save space.

The hash-file format should be "HASH-VALUE (one or more spaces) FILENAME".

=head1 EXAMPLE

 root# cd /some/place
 root# find . -size +70000 -print0 | xargs -0 md5sum | sort > /tmp/hash

 root# cat /tmp/hash
 f9203cea58151e423367d147bd334242  ./a/b/c/summary.pdf
 f9203cea58151e423367d147bd334242  ./d/e/f/report.pdf
 ...

Then, later on:

 root# cd /some/place
 root# linkdups /tmp/hash
 linking ./a/b/c/summary.pdf ./d/e/f/report.pdf
 ...

=head1 AUTHOR

 Karl Vogel <vogelke@pobox.com>
 Sumaria Systems, Inc.

=cut
