From 4f4e5540fffce864d78a9a84d3b26f0ba0d76de2 Mon Sep 17 00:00:00 2001
From: hjp
Date: Thu, 2 Sep 2010 08:24:07 +0000
Subject: [PATCH] lock fileset during backup to avoid concurrent backups of
 the same dataset. Only search the last few (well, currently 30) sessions
 with a backup on the same device for matching files in linkdup. Started to
 work on support for tokyocabinet

---
 lib/Simba/CA.pm | 41 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/lib/Simba/CA.pm b/lib/Simba/CA.pm
index 1445d7c..20bc78b 100644
--- a/lib/Simba/CA.pm
+++ b/lib/Simba/CA.pm
@@ -76,7 +76,7 @@ Readonly my $BUFSIZE => 128 * 1024;
 
 sub new {
     my ($class, $opt) = @_;
-    
+
     my $self = {};
     bless $self, $class;
 
@@ -104,7 +104,12 @@ sub new {
                 RaiseError => 1
             }
         );
+    } elsif ($opt->{tokyocabinet}) {
+        my $tdb = $self->{tdb} = TokyoCabinet::TDB->new();
+        $tdb->open($opt->{tokyocabinet}, $tdb->WRITER, $tdb->OCREAT)
+            or die "open $opt->{tokyocabinet} failed: " . $tdb->errmsg($tdb->ecode());
     }
+    # XXX - DBI
     $self->{targets} = $self->{dbh}->selectall_arrayref("select * from filesets", { Slice => {} });
     if ($opt->{filesets}) {
         $self->{targets} =
@@ -148,6 +153,12 @@ sub run {
 sub backup2disk {
     my ($self, $target) = @_;
 
+    # XXX - lock fileset here, we don't want to back up the same fileset twice concurrently.
+    # theoretically something like
+    $self->{dbh}->selectall_arrayref(q{select * from filesets where id = ?}, {}, $target->{id});
+    # should suffice, but I'm not sure if that blocks too much (does that block reads? block the whole table?)
+    # Just try it and we will see ...
+
     $self->log(3, "starting backup for target host " . $target->{host} . " dir " . $target->{dir});
     $self->{target} = $target;
 
@@ -570,6 +581,27 @@ sub linkdup {
     my ($self, $f, $backup_filename) = @_;
     my $t0 = gettimeofday();
     # XXX - this seems to be slow
+    # XXX - creates huge temp files. Restrict to last few sessions or at least sessions on the same device?
+    # XXX - that's not quite as simple: We only have the prefix, but there are many prefixes on the same
+    #       device. We can create a list of them at first call, though, and then pass the list
+    #       to the query. Maybe even shorten the list. ($n newest sessions only)
+    # XXX - another possible optimization is to check the last few files we've written: .svn/prop-base
+    #       normally contains a lot of identical files.
+
+    unless ($self->{sessions_on_same_device}) {
+        my $st = stat($backup_filename);
+        my $my_dev = defined $st ? $st->dev : ""; # can this happen?
+        my $sth = $self->{dbh}->prepare("select * from sessions order by id desc");
+        $sth->execute();
+        while (my $r = $sth->fetchrow_hashref()) {
+            my $st = lstat $r->{prefix};
+            my $dev = defined $st ? $st->dev : "";
+            next if $dev ne $my_dev;
+            last if @{ $self->{sessions_on_same_device} || [] } > 30;
+            push @{ $self->{sessions_on_same_device} }, $r;
+        }
+
+    }
     my $sth = $self->{dbh}->prepare("select * from versions2, instances, files, sessions
                                       where file_type=? and file_size=? and file_mtime=?
                                         and file_owner=? and file_group=? and file_acl=?
@@ -578,19 +610,22 @@ sub linkdup {
                                         and instances.file=files.id
                                         and versions2.id=instances.version
                                         and instances.session=sessions.id
-                                        order by instances.session desc
+                                        and sessions.id in (" . join(", ", map("?", @{ $self->{sessions_on_same_device} })) . ")" .
+                                    " order by instances.session desc
                                     ");
     $sth->execute(
         $f->{t}, $f->{s}, $f->{m}, $f->{o}, $f->{g}, $f->{acl},
         join(',', map {$f->{$_} ? ($_) : ()} qw(setuid setgid sticky)),
-        $f->{checksum}
+        $f->{checksum},
+        map($_->{id}, @{ $self->{sessions_on_same_device} }),
     );
 
     my $st = stat($backup_filename);
     my $my_dev = defined $st ? $st->dev : "";
     while (my $r = $sth->fetchrow_hashref()) {
         # check if old file is on same device. If it isn't, skip it.
+        # XXX - this should now be obsolete because we already selected only matching sessions above.
         unless ($self->{prefix_device}{$r->{prefix}}) {
             my $st = lstat $r->{prefix};
             $self->{prefix_device}{$r->{prefix}}
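
Note on the fileset lock added in backup2disk above: a plain SELECT through
selectall_arrayref normally takes no row lock, so on most backends it will not
by itself stop a second concurrent backup of the same fileset. A minimal
alternative sketch (illustrative only, not part of this patch), assuming the
database supports SELECT ... FOR UPDATE (e.g. PostgreSQL or MySQL/InnoDB) and
reusing the existing filesets table, $self->{dbh} handle and $target->{id}:

    # start a transaction so the row lock is held until commit/rollback
    $self->{dbh}->begin_work();

    # take a row-level lock on this fileset; a second backup2disk run for the
    # same fileset id blocks here, while other filesets stay unaffected
    my $fileset = $self->{dbh}->selectrow_hashref(
        q{select * from filesets where id = ? for update},
        {}, $target->{id},
    );

    # ... run the backup for this fileset ...

    $self->{dbh}->commit();    # releases the row lock

Whether this blocks more than intended depends on the backend's isolation
level, so it would still need the "just try it and see" testing mentioned in
the comment above.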