#!/usr/bin/perl

=head1 SYNOPSIS

    remove_session [--age time] [session_ids ...]

=head1 DESCRIPTION

The C<remove_session> script removes the specified sessions from the
simba database and then cleans up orphaned entries.

The sessions can be specified by minimum age or by listing their ids (or
both, but that is probably not very useful).

The age is specified as a fractional number followed by a unit: "y",
"m", "w", or "d". So C<remove_session --age 2.5y> removes all sessions
older than 2.5 years (actually 2.5 * 365 * 86400 seconds -- leap years
and DST are ignored), and C<remove_session --age 52w> removes all
sessions older than 52 weeks.
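
For example (the session ids below are hypothetical, shown only for
illustration):

    remove_session --age 2.5y       # sessions older than about 2.5 years
    remove_session --age 52w        # sessions older than 52 weeks
    remove_session 1042 1043        # two specific sessions, by id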

If neither the age nor a list of sessions is specified, no sessions are
removed, but the cleanup phase is still run, which may take considerable
time (and might be considered a bug).

=cut

# This script removes all data associated with the given sessions.
# For each session it first removes all instances of that session and
# then cleans up any orphans.

use warnings;
use strict;

use Simba::CA;
use Bit::Vector::Judy;
use Getopt::Long;
use Pod::Usage;
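
# Autoflush STDOUT so the progress messages below appear immediately.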
$| = 1;

my $ca = Simba::CA->new({
    dbi_file => $ENV{SIMBA_DB_CONN} || "$ENV{HOME}/.dbi/simba",
});
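
# Database handle and the partition size of the instances table, both
# provided by the Simba::CA object.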
my $dbh = $ca->{dbh};
my $partition_size = $ca->{instances_part_size};

my %opt;
GetOptions(
    \%opt,
    "age=s",
    "help",
) or pod2usage(verbose => 0);

if ($opt{help}) {
    pod2usage(verbose => 2);
}
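
# Translate --age into a cutoff timestamp and append every session older than
# that cutoff to the list of session ids given on the command line.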
if ($opt{age}) {
    my ($num, $unit) = $opt{age} =~ /(\d+(?:\.\d+)?)(y|m|w|d)/;
    my $scale = { y => 365 * 86400,
                  m =>  30 * 86400,
                  w =>   7 * 86400,
                  d =>   1 * 86400 }->{$unit};
    die "unknown time unit $unit" unless $scale;
    my $expired_sessions
        = $dbh->selectcol_arrayref("select id from sessions where start_date < ? order by id",
                                   {},
                                   time() - $num * $scale);
    push @ARGV, @$expired_sessions;
}
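
# First pass: delete all instances belonging to each session in @ARGV.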
for my $session (@ARGV) {
    print "deleting instances of session $session\n";
    my $old_min_id = $dbh->selectrow_array("select min(id) from instances");
    my $n_instances = $dbh->do("delete from instances where session=?", {}, $session);
    print "\t$n_instances instances deleted\n";

    # Check if we just crossed into a new partition. If so, the old one is
    # empty and should be shrunk to minimum size.
    #
    # Note: This will not shrink partitions if we delete a session somewhere
    # in the middle, but I expect to do that rarely, and rebuilding a partition
    # after expiring a single session isn't worthwhile anyway. If I delete lots
    # of instances in the middle, I can always rebuild the affected partitions
    # manually.
    my $new_min_id = $dbh->selectrow_array("select min(id) from instances");
    if (int($new_min_id/$partition_size) > int($old_min_id/$partition_size)) {
        my $partition = sprintf("p%03d", int($new_min_id/$partition_size));
        $dbh->do("alter table instances rebuild partition $partition");
        print "\trebuilt partition $partition\n";
    }
    $dbh->commit();
}
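
# Second pass: remove rows in sessions, files, and versions2 that are no
# longer referenced by any row in instances.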
remove_orphaned_sessions();
remove_orphaned_files();
remove_orphaned_versions();
$dbh->disconnect();
exit();
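
# A session is orphaned when no row in instances refers to it; the right outer
# join below yields exactly those session ids.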
sub remove_orphaned_sessions {
    print "deleting orphaned sessions\n";
    my $sessions
        = $dbh->selectcol_arrayref(
              q{select s.id from instances i right outer join sessions s on i.session=s.id where i.id is null}
          );

    for my $session (@$sessions) {
        $dbh->do(q{delete from sessions where id=?}, {}, $session);
        print "\tsession $session deleted\n";
    }
    $dbh->commit();
}

sub remove_orphaned_files {
    print "deleting orphaned files\n";
    my $files
        = $dbh->selectcol_arrayref(
              q{select f.id from instances i right outer join files f on i.file=f.id where i.id is null}
          );

    for my $file (@$files) {
        $dbh->do(q{delete from files where id=?}, {}, $file);
        print "\tfile $file deleted\n";
    }
    $dbh->commit();
}

sub remove_orphaned_versions {
    # This differs from the other two because mysql doesn't find a good plan
    # for the outer join: It does an index lookup on instances for every row of
    # versions2. For the other tables that's good because sessions and files
    # are much smaller than instances, but there is only about a factor of 10
    # between versions2 and instances, so reading both sequentially is much
    # better. Surprisingly, perl is also faster at eliminating duplicates than
    # mysql, so just doing two selects and doing all the work in perl is faster
    # than “select distinct … minus …”, though not much.
    print "deleting orphaned versions\n";
    my $sth;
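
    # mysql_use_result makes DBD::mysql stream rows from the server instead of
    # buffering the whole result set in memory, which matters for tables of
    # this size.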
    $dbh->{'mysql_use_result'} = 1;
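
    # Collect the ids of all rows in versions2 in a bitmap, then clear every id
    # that is still referenced from instances; whatever remains set is orphaned.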
    my $versions = Bit::Vector::Judy->new;
    $sth = $dbh->prepare("select id from versions2");
    $sth->execute;
    my $i = 0;
    while (my $version = $sth->fetchrow_array) {
        if ($i % 1_000_000 == 0) {
            print "\t$i records from versions processed, ", $versions->count(0, -1), " versions found\n";
        }
        $versions->set($version);
        $i++;
    }

    $sth = $dbh->prepare("select version from instances");
    $sth->execute;
    $i = 0;
    while (my $version = $sth->fetchrow_array) {
        if ($i % 1_000_000 == 0) {
            print "\t$i records from instances processed, ", $versions->count(0, -1), " versions left\n";
        }
        $versions->unset($version);
        $i++;
    }
    $dbh->{'mysql_use_result'} = 0;
    print "\t$i records from instances processed, ", $versions->count(0, -1), " versions left\n";
    for (my $version = $versions->first(0); $version; $version = $versions->next($version)) {
        $dbh->do(q{delete from versions2 where id=?}, {}, $version);
        print "\tversion $version deleted\n";
    }
    $dbh->commit();
}
# vim: tw=132