#!/usr/bin/perl

# Provenance: commit 0ad0e43211843b87e7203f7a72f50a501c95ae9a
# by "Peter J. Holzer", Sun, 12 Jan 2014 21:06:46 +0100
# Subject: Find duplicates of files in one dir in another

=head1 NAME

finddup - yet another tool for finding duplicates

=head1 SYNOPSIS

    finddup NEEDLEDIR HAYSTACKDIR

=head1 DESCRIPTION

For every regular file below NEEDLEDIR, print one line for each file below
HAYSTACKDIR that has the same size and the same SHA-256 digest but is not the
very same file (same device and inode):

    needle-path -> duplicate-path

A needle file for which no such duplicate exists is printed as

    needle-path (NONE)

Haystack files are only read and hashed once a needle file of the same size
shows up, so sizes that never occur in NEEDLEDIR cost no I/O beyond a stat.

=cut

use v5.10;
use strict;
use warnings;
no autovivification 'fetch';
use File::Find;
use File::stat;
use File::Slurp;
use Digest::SHA qw(sha256_hex);

die "Usage: $0 NEEDLEDIR HAYSTACKDIR\n" unless @ARGV == 2;
my ($needledir, $haystackdir) = @ARGV;

# Index of haystack files: $haystack->{SIZE}{DIGEST} = [ file records ].
# The empty-string digest "" collects files of that size that have not been
# hashed yet.
my $haystack = {};

# Return the hex SHA-256 digest of the content of the file at $path.
# read_file() croaks on failure (File::Slurp's default error mode).
sub content_digest {
    my ($path) = @_;
    my $content = read_file($path, binmode => ':raw');
    return sha256_hex($content);
}

# File::Find callback: record every regular file below the haystack directory
# in the "not yet hashed" bucket for its size.
sub index_haystack_file {
    my $path = $File::Find::name;
    return unless -f $path;

    # stat (not lstat) so that size/dev/ino describe the same file that -f
    # tested and that content_digest() will read when $path is a symlink.
    my $st = stat($path) or return;
    my $size = $st->size;

    push @{ $haystack->{$size}{""} }, {
        path => $path,
        size => $size,
        dev  => $st->dev,
        ino  => $st->ino,
    };
    return;
}

# Hash all not-yet-hashed haystack files of the given size and move them from
# the "" bucket into the bucket for their digest.
sub hash_pending {
    my ($size) = @_;
    my $by_hash = $haystack->{$size};
    my $pending = delete $by_hash->{""} or return;

    for my $file (@$pending) {
        push @{ $by_hash->{ content_digest($file->{path}) } }, $file;
    }
    return;
}

# File::Find callback: report the duplicates of one needle file, or "(NONE)".
sub report_needle_file {
    my $needle = $File::Find::name;
    return unless -f $needle;

    my $st = stat($needle) or return;
    my $size = $st->size;
    my $found = 0;

    # Only read the needle file if some haystack file has the same size.
    if (exists $haystack->{$size}) {
        hash_pending($size);
        my $hash = content_digest($needle);

        for my $file (@{ $haystack->{$size}{$hash} // [] }) {
            # The very same file (e.g. overlapping trees or a hard link)
            # is not a duplicate.
            next if $st->dev == $file->{dev} && $st->ino == $file->{ino};
            say "$needle -> $file->{path}";
            $found = 1;
        }
    }

    say "$needle (NONE)" unless $found;
    return;
}

# no_chdir keeps the working directory fixed, so relative paths recorded
# during indexing remain valid when the files are read later.
find({ wanted => \&index_haystack_file, no_chdir => 1 }, $haystackdir);
find({ wanted => \&report_needle_file,  no_chdir => 1 }, $needledir);

# vim: tw=132 sw=4 expandtab