Find duplicate files even if file name is different.

This commit is contained in:
hjp 2007-11-17 21:03:10 +00:00
parent 474ef9e7ef
commit 579b0a33b3
4 changed files with 119 additions and 9 deletions

8
Notes
View File

@ -63,11 +63,7 @@ Tape performance:
Equality checking doesn't work for setuid files.
Implement equality checking via saved checksum.
checksum is null if file is linked. Is that ok? Can this be solved
together with the previous point?
exit if disk full
Move prune list to config file.
On my 800 MHz PIII, the CPU usage is rather high. Some profiling seems
to be necessary (or I should get a faster backup server :-)).

View File

@ -156,8 +156,15 @@ sub backup2disk {
} else {
$self->log(5, "unexpected trailer $trailer\n");
}
$self->setmeta($f);
$self->log(10, "stored");
my $oldfile = $self->finddup($f);
if ($oldfile) {
unlink($backup_filename) or die "cannot unlink $backup_filename: $!";
link($oldfile, $backup_filename) or die "cannot link $oldfile to $backup_filename: $!";
$self->log(10, "linked (dup)");
} else {
$self->setmeta($f);
$self->log(10, "stored");
}
} else {
$self->log(5, "unexpected header $header\n");
}
@ -421,6 +428,7 @@ sub close_session {
sub get_last_session_id {
my ($self) = @_;
return unless $self->{last_backup};
my $sessions = $self->{dbh}->selectall_arrayref("select * from sessions where prefix=?",
{ Slice => {} },
$self->{last_backup});
@ -428,5 +436,49 @@ sub get_last_session_id {
return $sessions->[0]{id};
}
=head2 finddup

Look for an already stored file which is identical to the file described
by C<$f>, regardless of its name or location.

The database is searched for online versions whose type, size, mtime,
owner, group, acl, unix bits (setuid, setgid, sticky) and checksum are
equal to those of C<$f>. Each candidate is then checked on disk with
lstat: mtime, size, uid, gid and permission bits must agree with C<$f>.

Returns the path (session prefix plus file path) of the first candidate
which passes both checks, or nothing if there is none.

=cut

sub finddup {
    my ($self, $f) = @_;

    # The statement text is kept exactly as it was, including line breaks.
    my $query = "select * from versions, files, sessions
where file_type=? and file_size=? and file_mtime=?
and file_owner=? and file_group=? and file_acl=?
and file_unix_bits=?
and checksum=? and online=1
and versions.file=files.id and versions.session=sessions.id";

    my $sth = $self->{dbh}->prepare($query);

    # Comma separated list of those special bits which are set in $f.
    my $unix_bits = join(',', grep { $f->{$_} } qw(setuid setgid sticky));

    $sth->execute(
        $f->{t}, $f->{s}, $f->{m},
        $f->{o}, $f->{g}, $f->{acl},
        $unix_bits,
        $f->{checksum}
    );

    CANDIDATE:
    while (my $row = $sth->fetchrow_hashref()) {
        my $candidate = "$row->{prefix}/$row->{path}";

        # The database entry alone is not trusted: the file must still
        # exist and its metadata on disk must match, too.
        my $st = lstat($candidate) or next CANDIDATE;
        next CANDIDATE unless $st->mtime == $f->{m};
        next CANDIDATE unless $st->size == $f->{s};
        next CANDIDATE unless $st->uid == $self->name2uid($f->{o});
        next CANDIDATE unless $st->gid == $self->name2gid($f->{g});
        next CANDIDATE unless ($st->mode & 07777) == $self->acl2mode($f);

        # First match wins; release the statement handle before leaving.
        $sth->finish();
        return $candidate;
    }

    return;
}
# Destructor: disconnect from the database when the object is destroyed.
#
# The handle may be missing (e.g. if the object was never fully set up, or
# during global destruction), so disconnect is only called when there is
# something to call it on. Returns nothing.
sub DESTROY {
    my ($self) = @_;

    # A destructor can run at almost any point; keep it from clobbering
    # status variables which the interrupted code may still want to read.
    local ($., $@, $!, $^E, $?);

    my $dbh = $self->{dbh};
    $dbh->disconnect() if $dbh;
    return;
}
# vim: tw=0 expandtab
1;

View File

@ -6,7 +6,8 @@ use Test::More tests => 15;
BEGIN { use_ok( 'Simba::CA' ); }
my $ca = Simba::CA->new({ dbi_file => $ENV{SIMBA_DB_CONN} || "$ENV{HOME}/.dbi/simba"});
my $ca = Simba::CA->new({ dbi_file => $ENV{SIMBA_DB_CONN} ||
"$ENV{HOME}/.dbi/simba_test"});
ok($ca, 'new CA');
my $uid;

61
t/02_ca.t Normal file
View File

@ -0,0 +1,61 @@
#!/usr/bin/perl
use warnings;
use strict;

# Live tests.
#
# These tests need to be run as simba_ca, which must be able to connect
# to simba_da@localhost. The backup/hard-link checks additionally require
# root and are skipped otherwise.

use Test::More tests => 8;
use File::stat;

BEGIN { use_ok( 'Simba::CA' ); }

my $dbi_file = $ENV{SIMBA_DB_CONN} || "$ENV{HOME}/.dbi/simba_test";
my $root     = '/var/tmp/simba_test';

my $ca = Simba::CA->new({ dbi_file => $dbi_file });
ok($ca, 'new CA');

# make sure filesets contains test data then connect again:
$ca->{dbh}->do("delete from filesets");
$ca->{dbh}->do("insert into filesets(host, dir) values(?, ?)",
               undef, 'localhost', $root);
$ca = Simba::CA->new({ dbi_file => $dbi_file });
ok($ca, 'new CA 2');

SKIP: {
    # The count must equal the number of tests in this block, otherwise
    # the plan of 8 is not met when the block is skipped.
    skip "not running as root", 5 unless $> == 0;

    # Directories may be left over from a previous run; that is fine.
    for my $dir ($root, "$root/d1", "$root/d2") {
        next if -d $dir;
        mkdir $dir or die "cannot mkdir $dir: $!";
    }

    # Two files with identical content and different names.
    for my $file ("$root/d1/f1", "$root/d2/f2") {
        open my $fh, '>:raw', $file or die "cannot open $file: $!";
        print $fh "test\n";
        close($fh) or die "cannot close $file: $!";
    }

    $ca->run();

    my $this_backup = $ca->{this_backup};

    # Guard the method calls so that a missing file yields failed tests
    # instead of aborting the script.
    my $st1 = lstat("$this_backup/d1/f1");
    ok($st1, "file 1 exists");
    is($st1 && $st1->nlink, 2, "file 1 has 2 links");

    my $st2 = lstat("$this_backup/d2/f2");
    ok($st2, "file 2 exists");
    is($st2 && $st2->nlink, 2, "file 2 has 2 links");

    is($st1 && $st1->ino, $st2 && $st2->ino, "file 1 and 2 are the same");

    # cleanup
    system("rm", "-rf", $this_backup);
    $ca->{dbh}->do("delete from versions");
    $ca->{dbh}->do("delete from files");
    $ca->{dbh}->do("delete from filesets");
}