simple/dup/removedups

58 lines
1.4 KiB
Perl
Executable File

#!/usr/bin/perl
use warnings;
use strict;
use File::Find;
use Digest::SHA1;
use Data::Dumper;
# Index of every regular file seen during the scan, grouped by size and then
# by inode:
#
#   $files{$size}{i}{"$dev.$ino"}{n}{$path} = 1
#
# Paths sharing a "$dev.$ino" key are already hard links to one another, so
# their content only has to be read once.  The main loop later adds
# $files{$size}{d}{$digest} entries pointing at these per-inode records.
my %files;

# File::Find "wanted" callback.  Records the entry named by $_ (full path in
# $File::Find::name) in %files if, and only if, it is a regular file.
# Takes no arguments; the return value is not meaningful.
sub check_file {
    # lstat rather than stat: a symbolic link must never be indexed under
    # its target's inode, because every recorded path of a duplicate inode
    # is later replaced by a hard link -- which would destroy the symlink.
    my ($dev, $ino, $size) = (lstat $_)[0, 1, 7];

    # "-f _" reuses the lstat buffer, so symlinks, directories, devices and
    # entries that could not be examined at all are skipped here.
    return unless defined $size && -f _;

    $files{$size}{i}{"$dev.$ino"}{n}{$File::Find::name} = 1;
    return;
}
# Main program.
#
# 1. Walk every path given on the command line and index the files found by
#    size and inode (see check_file).
# 2. For each size shared by more than one inode, compute a SHA-1 digest of
#    one path per inode.  The first inode seen with a given digest is kept;
#    every path of any later inode with the same digest is replaced by a
#    hard link to the kept file.
#
# Progress and errors go to STDERR, the "linking" lines go to STDOUT.
print STDERR "sorting files by size\n";
find(\&check_file, @ARGV);
print STDERR "... done\n";

SIZE:
for my $size (sort { $a <=> $b } keys %files) {
    print STDERR "checking files of size $size\n";
    my $inodes = $files{$size}{i};

    # A size owned by a single inode cannot contain duplicates.
    if (keys(%$inodes) == 1) {
        print STDERR "only one file of size $size: skipping\n";
        next SIZE;
    }

    INODE:
    for my $inode (keys %$inodes) {
        # Each inode is handled exactly once; take its record out of the
        # table now so every exit path below leaves the table consistent.
        my $entry = delete $inodes->{$inode};

        # All names of one inode share the same content: read any of them.
        my ($path) = keys %{ $entry->{n} };

        my $fh;
        unless (open($fh, '<', $path)) {
            print STDERR "cannot open $path: $!: ignoring\n";
            next INODE;
        }
        binmode($fh);

        # addfile croaks on a read error; treat that like an unreadable
        # file instead of aborting the whole run.
        my $digest = eval {
            my $sha1 = Digest::SHA1->new;
            $sha1->addfile($fh);
            $sha1->b64digest;
        };
        chomp(my $why = $@);
        close($fh);
        unless (defined $digest) {
            print STDERR "cannot read $path: $why: ignoring\n";
            next INODE;
        }

        # First inode with this digest: remember it as the link target.
        my $known = $files{$size}{d}{$digest};
        unless ($known) {
            $files{$size}{d}{$digest} = $entry;
            next INODE;
        }

        print STDERR "\t\tduplicate found\n";
        my ($keep) = keys %{ $known->{n} };

        NAME:
        for my $dup (keys %{ $entry->{n} }) {
            print "\t\t\tlinking $dup to $keep\n";

            # Build a temporary name in the same directory as $dup, so the
            # final rename stays on one filesystem and replaces $dup
            # atomically.
            (my $tmp = $dup) =~ s|(.*/)(.*)|$1|;
            $tmp .= "removedups.$$." . rand;

            my $linked = link($keep, $tmp);
            next NAME if $linked && rename($tmp, $dup);

            # Save the error text before unlink can overwrite $!.
            my $err = "$!";

            # Only remove the temporary name if this run created it; when
            # link itself failed, whatever is at $tmp is not ours.
            unlink($tmp) if $linked;
            print STDERR "\t\t\t\terror: $err\n";
        }
    }
    # Debugging aid: print Dumper $files{$size};
}