diff --git a/dup/removeduplicatemails b/dup/removeduplicatemails
new file mode 100755
index 0000000..19a0124
--- /dev/null
+++ b/dup/removeduplicatemails
@@ -0,0 +1,88 @@
+#!/usr/bin/perl
+# removeduplicatemails - replace mail files that carry the same Message-Id
+# with hard links to the first copy found.
+#
+# usage: removeduplicatemails DIR...
+use warnings;
+use strict;
+
+use File::Find;
+
+use MIME::Parser;
+my $parser = MIME::Parser->new;
+$parser->output_to_core(1);
+$parser->tmp_to_core(1);
+
+# "dev.ino" => path of the file kept for a message
+my %files;
+# Message-Id => "dev.ino" of the file kept for that message
+my %messages;
+
+# Replace $dup with a hard link to $orig: link under a temporary name next
+# to $dup, then rename over it.  Returns true on success; on failure $!
+# holds the error and the temporary link has been removed again.
+sub replace_with_link {
+    my ($orig, $dup) = @_;
+    my ($dir) = $dup =~ m{\A(.*/)}s;
+    $dir = '' unless defined $dir;
+    my $tmp = $dir . "removedups.$$." . rand;
+    link($orig, $tmp) or return;
+    unless (rename($tmp, $dup)) {
+        my $err = $!;
+        unlink($tmp);    # do not leave the temporary link behind
+        $! = $err;
+        return;
+    }
+    return 1;
+}
+
+# File::Find callback (run with no_chdir, so paths stay valid): keep the
+# first file seen for each Message-Id and replace later ones with links.
+sub check_file {
+    unless (-f) {
+        return;
+    }
+    my ($dev, $ino) = stat(_);
+    if ($files{"$dev.$ino"}) {
+        # we already looked at this file - skip
+        return;
+    }
+    my $name = $File::Find::name;
+    my $fh;
+    unless (open($fh, '<', $name)) {
+        print STDERR "$name cannot be opened: $!: skipping\n";
+        return;
+    }
+    # A parser exception must only skip this file, not abort the whole run.
+    my $entity = eval { $parser->parse($fh) };
+    close($fh);
+    unless ($entity) {
+        print STDERR "$name cannot be parsed: skipping\n";
+        return;
+    }
+    my $mid = $entity->head->get('Message-Id');
+    if (defined $mid) {
+        # compare ids without surrounding whitespace or trailing newline
+        $mid =~ s/\A\s+//;
+        $mid =~ s/\s+\z//;
+    }
+    unless (defined $mid && length $mid) {
+        print STDERR "$name contains no message id: skipping\n";
+        return;
+    }
+    if ($messages{$mid}) {
+        # duplicate!
+        my $orig = $files{$messages{$mid}};
+        print STDERR "$name is a duplicate of ", $orig, "\n";
+        replace_with_link($orig, $name)
+            or print STDERR "\terror: $!\n";
+        return;
+    }
+    $messages{$mid} = "$dev.$ino";
+    $files{"$dev.$ino"} = $name;
+}
+
+# no_chdir: the callback links and renames by $File::Find::name, which is
+# only valid relative to the starting directory.
+find({ wanted => \&check_file, no_chdir => 1 }, @ARGV);
+
diff --git a/dup/removedups b/dup/removedups
new file mode 100755
index 0000000..508b503
--- /dev/null
+++ b/dup/removedups
@@ -0,0 +1,80 @@
+#!/usr/bin/perl
+# removedups - replace files with identical content by hard links to a
+# single copy.
+#
+# usage: removedups DIR...
+use warnings;
+use strict;
+
+use File::Find;
+use Digest::SHA1;
+use Data::Dumper;
+
+# size => { i => { "dev.ino" => { n => { path => 1 } } },  # not yet checked
+#           d => { digest    => { n => { path => 1 } } } } # kept per digest
+my %files;
+
+# File::Find callback: group every regular file by size and by inode.
+sub check_file {
+    unless (-f) {
+        return;
+    }
+    my ($dev, $ino, $size) = (stat(_))[0, 1, 7];
+    $files{$size}{i}{"$dev.$ino"}{n}{$File::Find::name} = 1;
+}
+
+# Replace $dup with a hard link to $orig: link under a temporary name next
+# to $dup, then rename over it.  Returns true on success; on failure $!
+# holds the error and the temporary link has been removed again.
+sub replace_with_link {
+    my ($orig, $dup) = @_;
+    my ($dir) = $dup =~ m{\A(.*/)}s;
+    $dir = '' unless defined $dir;
+    my $tmp = $dir . "removedups.$$." . rand;
+    link($orig, $tmp) or return;
+    unless (rename($tmp, $dup)) {
+        my $err = $!;
+        unlink($tmp);    # do not leave the temporary link behind
+        $! = $err;
+        return;
+    }
+    return 1;
+}
+
+print STDERR "sorting files by size\n";
+find(\&check_file, @ARGV);
+print STDERR "... done\n";
+
+for my $s (sort { $a <=> $b } keys %files) {
+    print STDERR "checking files of size $s\n";
+    if (scalar keys %{$files{$s}{i}} == 1) {
+        print STDERR "only one file of size $s: skipping\n";
+    } else {
+        for my $i (keys %{$files{$s}{i}}) {
+            my $f = (keys %{$files{$s}{i}{$i}{n}})[0];
+            if (open(my $fh, "<", $f)) {
+                binmode($fh);
+                # print STDERR "\tcomputing checksum of $f\n";
+                my $sha1 = Digest::SHA1->new;
+                $sha1->addfile($fh);
+                close($fh);
+                my $d = $sha1->b64digest;
+                if ($files{$s}{d}{$d}) {
+                    print STDERR "\t\tduplicate found\n";
+                    my $fo = (keys %{$files{$s}{d}{$d}{n}})[0];
+                    for my $fd (keys %{$files{$s}{i}{$i}{n}}) {
+                        print "\t\t\tlinking $fd to $fo\n";
+                        replace_with_link($fo, $fd)
+                            or print STDERR "\t\t\t\terror: $!\n";
+                    }
+                } else {
+                    $files{$s}{d}{$d} = $files{$s}{i}{$i};
+                }
+            } else {
+                print STDERR "cannot open $f: $!: ignoring\n";
+            }
+            delete $files{$s}{i}{$i};
+        }
+        # print Dumper $files{$s};
+    }
+}