simba/lib/Simba/CA.pm

583 lines
18 KiB
Perl
Raw Normal View History

#!/usr/bin/perl
=head1 NAME
Simba::CA
=head1 DESCRIPTION
Collecting Agent of the Simba backup system.
This class represents one instance of a running collecting agent.
The only user-callable methods are the constructor new and the instance
method run, which collects all the files from various disk agents.
The Simba::CA package is a hashref with the following keys:
=over
=item basedir
=item unknown_uid
=item unknown_gid
=item fh_log
=item log_level
=item dbh
=item targets
=item ssh_id_file
=item target
=item last_backup
=item last_backup_id
=item timestamp
=item this_backup
=item session_id
=item file_pid
=item file_cfd
=item file_dfd
=back
=cut
2006-10-02 12:56:36 +02:00
package Simba::CA;
use strict;
use warnings;
2006-10-02 12:56:36 +02:00
use Encode;
use IPC::Open2;
use POSIX qw(strftime);
use Simba::Util qw(quote unquote typestr);
use Readonly;
use Digest::SHA1;
use List::Util qw(min);
use IO::Handle;
use File::stat;
use Scalar::Util qw(tainted);
use DBI;
Readonly my $BUFSIZE => 128 * 1024;
2006-10-02 12:56:36 +02:00
sub new {
2006-11-30 15:50:23 +01:00
my ($class, $opt) = @_;
2006-10-02 12:56:36 +02:00
my $self = {};
bless $self, $class;
2006-11-20 12:26:30 +01:00
$self->{basedir} = '/backup';
2006-11-30 15:50:23 +01:00
$self->{unknown_uid} = 65533;
$self->{unknown_gid} = 65533;
$self->{fh_log} = exists($opt->{fh_log}) ? $opt->{fh_log} : \*STDERR;
$self->{log_level} = 99;
2007-06-17 23:40:04 +02:00
if ($opt->{dbi}) {
$self->{dbh} = DBI->connect(@{ $opt->{dbi} },
{ AutoCommit => 0,
PrintError => 1,
RaiseError => 1
}
);
} elsif ($opt->{dbi_file}) {
my $fn = $opt->{dbi_file};
open(FN, "<$fn") or die "cannot open $fn: $!";
my $line = <FN>;
close(FN);
my @cred = split(/[\s\n]+/, $line);
$self->{dbh} = DBI->connect(@cred,
{ AutoCommit => 0,
PrintError => 1,
RaiseError => 1
}
);
}
$self->{targets} = $self->{dbh}->selectall_arrayref("select * from filesets", { Slice => {} });
if ($opt->{filesets}) {
$self->{targets} =
[
grep {
my $id = $_->{id};
grep { $id == $_ } @{ $opt->{filesets} }
} @{ $self->{targets} }
];
}
if ($ENV{HOME} =~ m{([/\w]*)}) {
if (-f "$1/.ssh/id_rsa") {
if (my $st = stat("$1/.ssh/id_rsa")) {
if ($st->uid == $>) {
$self->{ssh_id_file} = "$1/.ssh/id_rsa";
}
}
}
}
2006-10-02 12:56:36 +02:00
return $self;
}
sub run {
my ($self) = @_;
# run sequentially for prototype. In production we probably
# want some concurrency
for my $target (@{$self->{targets}}) {
$self->backup2disk($target);
}
}
sub backup2disk {
my ($self, $target) = @_;
$self->log(3, "starting backup for target host " . $target->{host} . " dir " . $target->{dir});
$self->{target} = $target;
2006-10-02 12:56:36 +02:00
# get previous generation
my @dirs = glob($self->{basedir} . '/????-??-??T??.??.??/' . $target->{host} . '/' . $target->{dir});
$self->{last_backup} = $dirs[-1];
$self->{last_backup} = $1 if $self->{last_backup} =~ /(.*)/; # detaint
$self->{last_backup_id} = $self->get_last_session_id();
my $timestamp = $self->{timestamp} || strftime('%Y-%m-%dT%H.%M.%S', localtime);
$self->{this_backup} = $self->{basedir} . "/$timestamp/" . $target->{host} . '/' . $target->{dir};
2007-06-18 22:11:27 +02:00
$self->new_session();
2006-10-02 12:56:36 +02:00
my ($list_pid, $list_cfd, $list_dfd); # connection to get list of files
$list_pid = open2($list_dfd, $list_cfd,
"/usr/bin/ssh",
"-l", "simba_da",
$self->{ssh_id_file} ? ("-i", $self->{ssh_id_file}) : (),
$target->{host}, "da");
$list_cfd->printflush("list $target->{dir}\n"); # XXX - encode!
close($list_cfd);
my $count = 0;
2006-10-02 12:56:36 +02:00
while (<$list_dfd>) {
$count++;
2007-06-18 03:00:29 +02:00
chomp;
$self->log(10, "file: $_");
2006-10-02 12:56:36 +02:00
# split into fields
chomp;
2006-10-02 12:56:36 +02:00
my $f = $self->parse($_);
2006-10-02 12:56:36 +02:00
if ($f->{t} eq 'f') {
$self->store_file($f);
} elsif ($f->{t} eq 'd') {
my $d = "$self->{this_backup}/$f->{name}";
$d =~ s,//+,/,g;
mkdir_p($d) or die "cannot mkdir $d: $!"; # XXX
$self->setmeta($f);
2006-11-28 17:27:59 +01:00
} elsif ($f->{t} eq 'l') {
my $l = "$self->{this_backup}/$f->{name}";
2007-06-17 23:58:57 +02:00
unless (symlink($f->{lt}, $l)) {
die "cannot symlink $l -> $f->{lt}: $!"; # XXX
}
2006-11-28 17:27:59 +01:00
# $self->setmeta($f); ignore for symlinks. would need to use
# lchown, lchmod, etc.
2006-10-02 12:56:36 +02:00
} else {
# create local copy (or insert into DB only?)
2007-06-18 03:00:29 +02:00
$self->log(5, "ignored $_\n");
2006-10-02 12:56:36 +02:00
}
# insert into DB.
$self->db_record_version($target, $f);
2006-10-02 12:56:36 +02:00
}
2007-06-18 22:11:27 +02:00
$self->close_session();
$self->log(3, "finished backup for target host " . $target->{host} . " dir " . $target->{dir} . ": $count files");
2006-10-02 12:56:36 +02:00
}
sub parse {
my ($self, $s) = @_;
my @s = split(/ +/, $s);
my $f = {};
$f->{name} = shift @s;
$f->{name} = $1 if ($f->{name} =~ /(.*)/); # detaint XXX
for (@s) {
my ($k, $v) = split(/=/, $_, 2);
$f->{$k} = $v;
# special processing for permissions etc, here?
}
$f->{o} = unquote($f->{o});
$f->{g} = unquote($f->{g});
$f->{acl} = unquote($f->{acl});
$f->{m} = $1 if $f->{m} =~ /^(\d+)$/;
2006-11-30 14:55:53 +01:00
$f->{lt} = unquote($1) if defined $f->{lt} && $f->{lt} =~ /(.*)/;
return $f;
}
sub present {
my ($self, $f) = @_;
return unless $self->{last_backup};
my $st = lstat("$self->{last_backup}/$f->{name}");
return unless $st;
if ($st->mtime == $f->{m} &&
$st->size == $f->{s} &&
2006-11-30 15:50:23 +01:00
$st->uid == $self->name2uid($f->{o}) &&
$st->gid == $self->name2gid($f->{g}) &&
($st->mode & 07777) == $self->acl2mode($f)
) {
return 1;
} else {
return 0;
}
}
sub mkdir_p {
my ($dir, $perm) = @_;
$perm = 0777 unless(defined($perm));
if (-d $dir) {
return 1;
} elsif (mkdir($dir, $perm)) {
return 1;
} elsif ($!{ENOENT}) {
my $parentdir = $dir;
$parentdir =~ s|(.*)/.+|$1|;
mkdir_p($parentdir, $perm);
if (-d $dir) {
return 1;
} else {
return mkdir($dir, $perm);
}
} else {
return undef;
}
}
2006-11-20 12:26:30 +01:00
sub basedir {
my ($self, $dir) = @_;
$self->{basedir} = $dir if defined($dir);
return $self->{basedir};
}
sub targets {
my ($self, $targets) = @_;
$self->{targets} = $targets if defined($targets);
return $self->{targets};
}
sub add_target {
my ($self, $target) = @_;
push @{ $self->{targets} }, $target;
return $self->{targets};
}
my %permstrbits = (
'---' => 0,
'--x' => 1,
'-w-' => 2,
'-wx' => 3,
'r--' => 4,
'r-x' => 5,
'rw-' => 6,
'rwx' => 7,
);
sub setmeta {
my ($self, $f) = @_;
my $fn = "$self->{this_backup}/$f->{name}";
2007-06-18 03:00:29 +02:00
$self->log(3, "$fn is tainted!") if tainted($fn);
my $mode = $self->acl2mode($f);
2007-06-18 03:00:29 +02:00
$self->log(3, "$mode is tainted!") if tainted($mode);
chmod($mode, $fn);
2006-11-30 15:50:23 +01:00
chown($self->name2uid($f->{o}), $self->name2gid($f->{g}), $fn);
utime(time, $f->{m}, $fn);
}
# computes the mode from the acl (and the set[ug]id and sticky bits)
# and returns it. Optional ACL entries are currently ignored but should
# eventually be returned as a second value.
sub acl2mode {
my ($self, $f) = @_;
my $mode = 0;
if ($f->{acl}) {
for my $ace (split(',', $f->{acl})) {
if ($ace =~ /^u::(...)$/) {
$mode |= ($permstrbits{$1} << 6);
} elsif ($ace =~ /^g::(...)$/) {
$mode |= ($permstrbits{$1} << 3);
} elsif ($ace =~ /^o:(...)$/) {
$mode |= ($permstrbits{$1} << 0);
} else {
$self->log(5, "warning: unknown ACE $ace ignored");
}
}
}
if ($f->{setuid}) { $mode |= 04000 }
if ($f->{setgid}) { $mode |= 02000 }
if ($f->{sticky}) { $mode |= 01000 }
return $mode;
}
my %ucache;
sub name2uid {
2006-11-30 15:50:23 +01:00
my ($self, $uname) = @_;
$uname = $1 if $uname =~ /(.*)/; # detaint
return $ucache{$uname} if (defined $ucache{$uname});
if ($uname =~ /^\d+$/) {
return $ucache{$uname} = $uname;
} else {
my $uid = getpwnam($uname);
if (defined($uid)) {
return $ucache{$uname} = $uid;
} else {
2006-11-30 15:50:23 +01:00
return $ucache{$uname} = $self->{unknown_uid};
}
}
}
my %gcache;
sub name2gid {
2006-11-30 15:50:23 +01:00
my ($self, $gname) = @_;
$gname = $1 if $gname =~ /(.*)/; # detaint
return $gcache{$gname} if (defined $gcache{$gname});
if ($gname =~ /^\d+$/) {
return $gcache{$gname} = $gname;
} else {
my $gid = getgrnam($gname);
if (defined($gid)) {
return $gcache{$gname} = $gid;
} else {
2006-11-30 15:50:23 +01:00
return $gcache{$gname} = $self->{unknown_gid};
}
}
}
2006-11-30 15:50:23 +01:00
sub log {
my ($self, $level, $msg) = @_;
if ($level <= $self->{log_level}) {
2007-06-22 22:41:56 +02:00
$self->{fh_log}->print(strftime("%Y-%m-%dT%H:%M:%S%z", localtime), " $$ [$level]: $msg\n")
or die "write to log failed: $!";
2006-11-30 15:50:23 +01:00
}
}
sub log_level {
my ($self, $log_level) = @_;
$self->{log_level} = $log_level if defined($log_level);
return $self->{log_level};
}
sub db_record_version {
my ($self, $target, $f) = @_;
my $db_f = $self->{dbh}->selectall_arrayref("select * from files where fileset=? and path=?",
{ Slice => {} },
$target->{id}, $f->{name});
unless (@$db_f) {
$self->{dbh}->do("insert into files(fileset, path) values(?, ?)",
{},
$target->{id}, $f->{name});
$db_f = $self->{dbh}->selectall_arrayref("select * from files where fileset=? and path=?",
{ Slice => {} },
$target->{id}, $f->{name});
}
if ($f->{t} eq 'f' && !defined($f->{checksum})) {
# this must be a link to the previous version
my $db_pv = $self->{dbh}->selectall_arrayref("select * from versions where file=? and session=?",
{ Slice => {} },
$db_f->[0]{id},
$self->{last_backup_id});
$f->{checksum} = $db_pv->[0]{checksum};
}
$self->{dbh}->do("insert into versions(file,
file_id, file_type, file_size, file_mtime,
file_owner, file_group, file_acl,
file_unix_bits,
file_rdev,
2007-06-22 22:41:56 +02:00
date, checksum, online, file_linktarget,
session)
values(?,
?, ?, ?, ?,
?, ?, ?,
?,
?,
2007-06-18 22:11:27 +02:00
?, ?, ?, ?,
?)",
{},
$db_f->[0]{id},
$f->{id}, $f->{t}, $f->{s}, $f->{m},
$f->{o}, $f->{g}, $f->{acl},
join(',', map {$f->{$_} ? ($_) : ()} qw(setuid setgid sticky)),
$f->{rdev},
2007-06-18 22:11:27 +02:00
time(), $f->{checksum}, 1, $f->{lt},
$self->{session_id}
);
}
2007-06-18 22:11:27 +02:00
sub new_session {
my ($self) = @_;
2007-06-22 22:41:56 +02:00
$self->{dbh}->do("insert into sessions(start_date, prefix) values(?, ?)", {}, time(), $self->{this_backup});
2007-06-18 22:11:27 +02:00
$self->{session_id} = $self->{dbh}->{'mysql_insertid'};
}
sub close_session {
my ($self) = @_;
$self->{dbh}->do("update sessions set end_date=? where id=?", {}, time(), $self->{session_id});
if ($self->{file_pid}) {
close($self->{file_cfd});
close($self->{file_dfd});
$self->log(3, "waiting for $self->{file_pid}");
waitpid $self->{file_pid}, 0;
$self->log(3, "$self->{file_pid} terminated with status $?");
delete $self->{file_cfd};
delete $self->{file_dfd};
delete $self->{file_pid};
}
delete $self->{target};
2007-06-18 22:11:27 +02:00
}
sub get_last_session_id {
my ($self) = @_;
return unless $self->{last_backup};
my $sessions = $self->{dbh}->selectall_arrayref("select * from sessions where prefix=?",
{ Slice => {} },
$self->{last_backup});
die "$self->{last_backup} not a unique prefix" unless @$sessions == 1;
return $sessions->[0]{id};
}
=head2 finddup
Find a duplicate of the current file in the database. This is useful if you
have multiple copies of a file stored in different locations.
=cut
sub finddup {
my ($self, $f) = @_;
my $sth = $self->{dbh}->prepare("select * from versions, files, sessions
where file_type=? and file_size=? and file_mtime=?
and file_owner=? and file_group=? and file_acl=?
and file_unix_bits=?
and checksum=? and online=1
and versions.file=files.id and versions.session=sessions.id");
$sth->execute(
$f->{t}, $f->{s}, $f->{m},
$f->{o}, $f->{g}, $f->{acl},
join(',', map {$f->{$_} ? ($_) : ()} qw(setuid setgid sticky)),
$f->{checksum}
);
while (my $r = $sth->fetchrow_hashref()) {
my $oldfile = "$r->{prefix}/$r->{path}";
if (my $st = lstat($oldfile)) {
if ($st->mtime == $f->{m} &&
$st->size == $f->{s} &&
$st->uid == $self->name2uid($f->{o}) &&
$st->gid == $self->name2gid($f->{g}) &&
($st->mode & 07777) == $self->acl2mode($f)
) {
$sth->finish();
return $oldfile;
}
}
}
return;
}
=head2 store_file
store a file in the local filesystem. If the file appears to be unchanged since
the last backup, try to create a hard link. Otherwise, get the contents of the
file from the DA, and search for a file with the same contents (i.e., checksum)
and metadata, but possibly different name and try to link to that. If no link
can be created to an existing file, create a new one.
=cut
sub store_file {
my ($self, $f) = @_;
if($self->present($f)) {
link("$self->{last_backup}/$f->{name}", "$self->{this_backup}/$f->{name}") or die; # XXX
$self->log(10, "linked");
} else {
# else request from da
unless ($self->{file_pid}) {
$self->{file_pid} = open2($self->{file_dfd}, $self->{file_cfd},
"/usr/bin/ssh",
"-l", "simba_da",
$self->{ssh_id_file} ? ("-i", $self->{ssh_id_file}) : (),
$self->{target}->{host}, "da");
}
$self->{file_cfd}->printflush("get $self->{target}->{dir}/$f->{name}\n"); # XXX - encode!
my $header = $self->{file_dfd}->getline; # this should be the same as $_ - check?
if ($header =~ /^data (.*)/) {
my $f2 = $self->parse($1);
my $backup_filename = "$self->{this_backup}/$f->{name}";
open(my $file_bfd, '>:raw', $backup_filename) or die "cannot open backup file $backup_filename: $!";
my $size = $f2->{s};
my $err;
my $sha1 = Digest::SHA1->new;
while ($size > 0) {
my $buffer;
my $rc = read($self->{file_dfd}, $buffer, min($size, $BUFSIZE));
if (!defined($rc)) {
# I/O error
$self->log(5, "error reading from data socket: $!");
last;
} elsif ($rc == 0) {
# premature EOF.
$self->log(5, "unexpected EOF reading from data socket");
last;
}
$file_bfd->print($buffer) or die "write to backup failed: $!";
$size -= length($buffer);
$sha1->add($buffer);
}
close($file_bfd) or die "write to backup failed: $!";
my $trailer = $self->{file_dfd}->getline; # should be empty line
$trailer = $self->{file_dfd}->getline;
if ($trailer =~ /^fail /) {
$self->log(5, $trailer);
} elsif ($trailer =~ /^chk sha1 (\w+)/) {
my $checksum = $sha1->hexdigest;
if ($checksum ne $1) {
$self->log(5, "checksum error\n");
}
$f->{checksum} = $checksum;
} else {
$self->log(5, "unexpected trailer $trailer\n");
}
my $oldfile = $self->finddup($f);
if ($oldfile) {
unlink($backup_filename) or die "cannot unlink $backup_filename: $!";
link($oldfile, $backup_filename) or die "cannot link $oldfile to $backup_filename: $!";
$self->log(10, "linked (dup)");
} else {
$self->setmeta($f);
$self->log(10, "stored");
}
} else {
$self->log(5, "unexpected header $header\n");
}
}
}
sub DESTROY {
my ($self) = @_;
$self->{dbh}->disconnect();
}
# vim: tw=0 expandtab
1;