#!/usr/bin/perl
#
# dups - identify duplicate files
#
# usage: find | dups [-hv]
#
#   -h: count hard-linked files as duplicates
#   -v: verbose output
#
# 1.0 (May 05 1995): written by Peter Couvares
#
# Possible Improvements:
#   - optionally accept filenames as arguments rather than via stdin
#   - replace call to "cmp" with perl code (a rough sketch appears at
#     the bottom of this file)
#   - use pack() to store file names (would be more memory-efficient for
#     huge file lists at some expense in speed...an option maybe?)
#
# NOTES:
#
# From: pfcouvar@amhux4.amherst.edu (Peter F. Couvares)
# Subject: dups-1.0 (Perl 5 script to identify duplicate files)
# Date: 7 May 1995 04:46:13 GMT
# Organization: Amherst College, Amherst MA, USA
# Message-ID: <3ohjal$paq@amhux3.amherst.edu>
# Archive-name: dups-1.0
# Submitted-by: pfcouvar@unix.amherst.edu
# X-News: alt.sources
#
# cvance@empedocles.cfar.umd.edu (Christopher Vance) writes:
#
# > A while back, I found a Unix utility called FindDupe that
# > would locate duplicate files in a bunch of given paths.  I
# > seem to have misplaced the source for it, and now all I have
# > is an old binary. [...] Does anyone have the source for this?
# > It's been very useful and I need it for other systems (SunOS,
# > and FreeBSD).  Or do you know of something similar?
#
# Here's something I whipped up for myself--it's in Perl so it should
# work most anywhere (assuming you're on top of things enough to have
# Perl 5 installed).

require 5.0;
use FileHandle;
use Getopt::Std;

# strip path from $0 and put what's left
# into $progname (for prettier errors)
$0 =~ m/([^\/]+)$/;
$progname = $1;

$usage = "usage: find | $progname [-hv]\n";

die $usage if -t STDIN;             # make sure we're getting piped STDIN
die $usage unless getopts("hv");    # check arguments

$HLINKS  = $opt_h;
$VERBOSE = $opt_v;

FILE:
while (<STDIN>)                     # for each given filename
{
    chomp;
    $filename = $_;

    unless (lstat $filename)        # get file's stats
    {
        print STDERR "$progname: warning: can't stat $filename\n";
        next FILE;
    }

    next FILE unless -f _;          # skip special or nonexistent files
    $size = -s _;                   # get size

    # device and inode of this file, for the hard-link check below
    ($devA, $inoA) = (lstat($filename))[0, 1];

    # compare with those files we've seen of the same size
    foreach $filenameB (@{$files{$size}})
    {
        # check if they're hard-linked by comparing devices and inodes
        ($devB, $inoB) = (lstat($filenameB))[0, 1];

        if (($devA == $devB) and ($inoA == $inoB))
        {
            next FILE unless $HLINKS;
            print "$filenameB $filename";
            print " (hard link)" if $VERBOSE;
            print "\n";
            next FILE;
        }

        # otherwise do a good ol' "cmp"
        unless (system('cmp', '-s', $filename, $filenameB))
        {
            print "$filenameB $filename\n";
            next FILE;
        }
    }

    # remember this file (index by its size)
    push(@{$files{$size}}, $filename);
}
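
# ---------------------------------------------------------------------
# Sketch of the "replace call to 'cmp' with perl code" idea from the
# Possible Improvements list above.  Nothing below is called by the
# script; same_contents() is a hypothetical name for a routine that
# could stand in for the system('cmp', '-s', ...) call.  It compares
# two regular files chunk by chunk and returns 1 if they are
# byte-for-byte identical, 0 otherwise.  Note the inverted sense
# relative to "cmp -s" (which exits 0 when the files match): the call
# in the loop above would become
#
#     if (same_contents($filename, $filenameB)) { ... }
# ---------------------------------------------------------------------
sub same_contents
{
    my ($fileA, $fileB) = @_;

    my $fhA = new FileHandle($fileA, "r");
    my $fhB = new FileHandle($fileB, "r");
    return 0 unless defined($fhA) and defined($fhB);  # unreadable => not duplicates
    binmode($fhA);
    binmode($fhB);

    my ($bufA, $bufB, $lenA, $lenB);
    while (1)
    {
        $lenA = read($fhA, $bufA, 65536);    # 64K chunks; regular files return
        $lenB = read($fhB, $bufB, 65536);    # the full amount until EOF
        return 0 unless defined($lenA) and defined($lenB);  # read error
        return 0 if $lenA != $lenB;          # one file ended first
        return 1 if $lenA == 0;              # both at EOF, all chunks matched
        return 0 if $bufA ne $bufB;          # chunks differ
    }
}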