From 4f4e5540fffce864d78a9a84d3b26f0ba0d76de2 Mon Sep 17 00:00:00 2001
From: hjp
Date: Thu, 2 Sep 2010 08:24:07 +0000
Subject: [PATCH] lock fileset during backup to avoid concurrent backups of
 the same dataset. Only search the last few (well, currently 30) sessions
 with a backup on the same device for matching files in linkdup. Started to
 work on support for tokyocabinet

---
 lib/Simba/CA.pm | 41 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/lib/Simba/CA.pm b/lib/Simba/CA.pm
index 1445d7c..20bc78b 100644
--- a/lib/Simba/CA.pm
+++ b/lib/Simba/CA.pm
@@ -76,7 +76,7 @@ Readonly my $BUFSIZE => 128 * 1024;
 
 sub new {
     my ($class, $opt) = @_;
-    
+
     my $self = {};
     bless $self, $class;
 
@@ -104,7 +104,12 @@ sub new {
                 RaiseError => 1
             }
         );
+    } elsif ($opt->{tokyocabinet}) {
+        my $tdb = $self->{tdb} = TokyoCabinet::TDB->new();
+        $tdb->open($opt->{tokyocabinet}, $tdb->WRITER, $tdb->OCREAT)
+            or die "open $opt->{tokyocabinet} failed: " . $tdb->errmsg($tdb->ecode());
     }
+    # XXX - DBI
     $self->{targets} = $self->{dbh}->selectall_arrayref("select * from filesets", { Slice => {} });
     if ($opt->{filesets}) {
         $self->{targets} =
@@ -148,6 +153,12 @@ sub run {
 sub backup2disk {
     my ($self, $target) = @_;
 
+    # XXX - lock fileset here, we don't want to back up the same fileset twice concurrently.
+    # theoretically something like
+    $self->{dbh}->selectall_arrayref(q{select * from filesets where id = ?}, {}, $target->{id});
+    # should suffice, but I'm not sure if that blocks too much (does that block reads? block the whole table?)
+    # Just try it and we will see ...
+
     $self->log(3, "starting backup for target host " . $target->{host} . " dir " . $target->{dir});
     $self->{target} = $target;
 
@@ -570,6 +581,27 @@ sub linkdup {
     my ($self, $f, $backup_filename) = @_;
     my $t0 = gettimeofday();
     # XXX - this seems to be slow
+    # XXX - creates huge temp files. Restrict to last few sessions or at least sessions on the same device?
+    # XXX - that's not quite as simple: We only have the prefix, but there are many prefixes on the same
+    #       device. We can create a list of them at first call, though, and then pass the list
+    #       to the query. Maybe even shorten the list. ($n newest sessions only)
+    # XXX - another possible optimization is to check the last few files we've written: .svn/prop-base
+    #       normally contains a lot of identical files.
+
+    unless ($self->{sessions_on_same_device}) {
+        my $st = stat($backup_filename);
+        my $my_dev = defined $st ? $st->dev : ""; # can this happen?
+        my $sth = $self->{dbh}->prepare("select * from sessions order by id desc");
+        $sth->execute();
+        while (my $r = $sth->fetchrow_hashref()) {
+            my $st = lstat $r->{prefix};
+            my $dev = defined $st ? $st->dev : "";
+            next if $dev ne $my_dev;
+            last if @{ $self->{sessions_on_same_device} || [] } > 30;
+            push @{ $self->{sessions_on_same_device} }, $r;
+        }
+
+    }
     my $sth = $self->{dbh}->prepare("select * from versions2, instances, files, sessions
                                       where file_type=? and file_size=? and file_mtime=?
                                         and file_owner=? and file_group=? and file_acl=?
@@ -578,19 +610,22 @@ sub linkdup {
                                         and instances.file=files.id
                                         and versions2.id=instances.version
                                         and instances.session=sessions.id
-                                        order by instances.session desc
+                                        and sessions.id in (" . join(", ", map("?", @{ $self->{sessions_on_same_device} })) . ")" .
+                                    " order by instances.session desc
                                     ");
     $sth->execute(
         $f->{t}, $f->{s}, $f->{m}, $f->{o}, $f->{g}, $f->{acl},
         join(',', map {$f->{$_} ? ($_) : ()} qw(setuid setgid sticky)),
-        $f->{checksum}
+        $f->{checksum},
+        map($_->{id}, @{ $self->{sessions_on_same_device} }),
     );
 
     my $st = stat($backup_filename);
     my $my_dev = defined $st ? $st->dev : "";
     while (my $r = $sth->fetchrow_hashref()) {
         # check if old file is on same device. If it isn't, skip it.
+        # XXX - this should now be obsolete because we already selected only matching sessions above.
         unless ($self->{prefix_device}{$r->{prefix}}) {
             my $st = lstat $r->{prefix};
             $self->{prefix_device}{$r->{prefix}}
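
Note on the fileset lock added in backup2disk above: a plain SELECT through
selectall_arrayref normally takes no row lock, so on most backends it will not
by itself stop a second concurrent backup of the same fileset. A minimal
alternative sketch (illustrative only, not part of this patch), assuming the
database supports SELECT ... FOR UPDATE (e.g. PostgreSQL or MySQL/InnoDB) and
reusing the existing filesets table, $self->{dbh} handle and $target->{id}:

    # start a transaction so the row lock is held until commit/rollback
    $self->{dbh}->begin_work();

    # take a row-level lock on this fileset; a second backup2disk run for the
    # same fileset id blocks here, while other filesets stay unaffected
    my $fileset = $self->{dbh}->selectrow_hashref(
        q{select * from filesets where id = ? for update},
        {}, $target->{id},
    );

    # ... run the backup for this fileset ...

    $self->{dbh}->commit();    # releases the row lock

Whether this blocks more than intended depends on the backend's isolation
level, so it would still need the "just try it and see" testing mentioned in
the comment above.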