From 289670b313fe029161a872a250403f2d041f8a95 Mon Sep 17 00:00:00 2001 From: "Peter J. Holzer" Date: Sun, 25 Jul 2021 12:19:18 +0200 Subject: [PATCH] Treat zero length files specially for better performance --- lib/Simba/CA.pm | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/lib/Simba/CA.pm b/lib/Simba/CA.pm index 15a624e..051587c 100644 --- a/lib/Simba/CA.pm +++ b/lib/Simba/CA.pm @@ -818,6 +818,25 @@ sub store_file { $self->log(5, "cannot link $self->{last_backup}/$f->{name} to $self->{this_backup}/$f->{name}: $!"); } } + # If size is zero, check if we have seen a matching file before. If we have, link to it. + # Ubuntu contains a lot of zero sized files (about 8000 per installed kernel). + # Searching for them in the database is slow, so we special-case that here. + # We could generalize that, but I don't think that there will ever be enough identical + # non-empty files to make that worthwhile. + # XXX - not yet implemented. + if ($f->{s} == 0 && $f->{t} eq 'f') { + no warnings 'uninitialized'; # unix bits may not exist + my $k = "$f->{m} $f->{o} $f->{g} $f->{acl} $f->{setuid} $f->{setgid} $f->{sticky}"; + if ($self->{null_files}{$k}) { + my $oldfile = $self->{null_files}{$k}{name}; + my $backup_filename = "$self->{this_backup}/$f->{name}"; + if (link($oldfile, $backup_filename)) { + $self->log(10, "linked (empty)"); + $self->{counts}{dup10}++; + return $success; + } + } + } # else request from da unless ($self->{file_pid}) { @@ -884,6 +903,11 @@ sub store_file { $self->setmeta($f); $self->log(10, "stored"); } + if ($f->{s} == 0 && $f->{t} eq 'f') { + no warnings 'uninitialized'; # unix bits may not exist + my $k = "$f->{m} $f->{o} $f->{g} $f->{acl} $f->{setuid} $f->{setgid} $f->{sticky}"; + $self->{null_files}{$k}{name} = $backup_filename; + } } else { $self->log(5, "unexpected header $header\n"); $self->close_file_connection;