#!/usr/bin/perl
#
# dups - identify duplicate files
#
# usage: find | dups [-hv]
#
#   -h: count hard-linked files as duplicates
#   -v: verbose output
#
# 1.0 (May 05 1995): written by Peter Couvares
#
# Possible Improvements:
#   - optionally accept filenames as arguments rather than via stdin
#   - replace call to "cmp" with perl code (a rough sketch appears at
#     the bottom of this file)
#   - use pack() to store file names (would be more memory-efficient for
#     huge file lists at some expense in speed...an option maybe?)
#
# NOTES:
#
# From: pfcouvar@amhux4.amherst.edu (Peter F. Couvares)
# Subject: dups-1.0 (Perl 5 script to identify duplicate files)
# Date: 7 May 1995 04:46:13 GMT
# Organization: Amherst College, Amherst MA, USA
# Message-ID: <3ohjal$paq@amhux3.amherst.edu>
# Archive-name: dups-1.0
# Submitted-by: pfcouvar@unix.amherst.edu
# X-News: alt.sources
#
# cvance@empedocles.cfar.umd.edu (Christopher Vance) writes:
#
# > A while back, I found a Unix utility called FindDupe that
# > would locate duplicate files in a bunch of given paths.  I
# > seem to have misplaced the source for it, and now all I have
# > is an old binary. [...] Does anyone have the source for this?
# > It's been very useful and I need it for other systems (SunOS,
# > and FreeBSD).  Or do you know of something similar?
#
# Here's something I whipped up for myself--it's in Perl so it should
# work most anywhere (assuming you're on top of things enough to have
# Perl 5 installed).

require 5.0;
use FileHandle;
use Getopt::Std;

# strip path from $0 and put what's left
# into $progname (for prettier errors)
$0 =~ m/([^\/]+)$/;
$progname = $1;

$usage = "usage: find | $progname [-hv]\n";

die $usage if -t STDIN;             # make sure we're getting piped STDIN
die $usage unless getopts("hv");    # check arguments

$HLINKS  = $opt_h;
$VERBOSE = $opt_v;

FILE:
while (<STDIN>)                     # for each given filename
{
    chomp;
    $filename = $_;

    unless (lstat $filename)        # get file's stats
    {
        print STDERR "$progname: warning: can't stat $filename\n";
        next FILE;
    }

    next FILE unless -f _;          # skip special or nonexistent files
    $size = -s _;                   # get size

    # device and inode of this file, for the hard-link check below
    ($devA, $inoA) = (lstat($filename))[0, 1];

    # compare with those files we've seen of the same size
    foreach $filenameB (@{$files{$size}})
    {
        # check if they're hard-linked by comparing devices and inodes
        ($devB, $inoB) = (lstat($filenameB))[0, 1];

        if (($devA == $devB) and ($inoA == $inoB))
        {
            next FILE unless $HLINKS;
            print "$filenameB $filename";
            print " (hard link)" if $VERBOSE;
            print "\n";
            next FILE;
        }

        # otherwise do a good ol' "cmp"
        unless (system('cmp', '-s', $filename, $filenameB))
        {
            print "$filenameB $filename\n";
            next FILE;
        }
    }

    # remember this file (index by its size)
    push(@{$files{$size}}, $filename);
}
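
# ---------------------------------------------------------------------
# Sketch of the "replace call to 'cmp' with perl code" idea from the
# Possible Improvements list above.  Nothing below is called by the
# script; same_contents() is a hypothetical name for a routine that
# could stand in for the system('cmp', '-s', ...) call.  It compares
# two regular files chunk by chunk and returns 1 if they are
# byte-for-byte identical, 0 otherwise.  Note the inverted sense
# relative to "cmp -s" (which exits 0 when the files match): the call
# in the loop above would become
#
#     if (same_contents($filename, $filenameB)) { ... }
# ---------------------------------------------------------------------
sub same_contents
{
    my ($fileA, $fileB) = @_;

    my $fhA = new FileHandle($fileA, "r");
    my $fhB = new FileHandle($fileB, "r");
    return 0 unless defined($fhA) and defined($fhB);  # unreadable => not duplicates
    binmode($fhA);
    binmode($fhB);

    my ($bufA, $bufB, $lenA, $lenB);
    while (1)
    {
        $lenA = read($fhA, $bufA, 65536);    # 64K chunks; regular files return
        $lenB = read($fhB, $bufB, 65536);    # the full amount until EOF
        return 0 unless defined($lenA) and defined($lenB);  # read error
        return 0 if $lenA != $lenB;          # one file ended first
        return 1 if $lenA == 0;              # both at EOF, all chunks matched
        return 0 if $bufA ne $bufB;          # chunks differ
    }
}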