Reimplemented the outer join in Perl with two separate queries and a
Judy bitmap. This is about 20 times faster (1 hour instead of 20 hours for 1.3E9 instances and 125E6 versions).
This commit is contained in:
parent
71e2f16454
commit
6982561a6d
|
@ -8,6 +8,7 @@ use warnings;
|
|||
use strict;
|
||||
|
||||
use Simba::CA;
|
||||
use Bit::Vector::Judy;
|
||||
|
||||
$| = 1;
|
||||
|
||||
|
@ -18,18 +19,19 @@ my $ca = Simba::CA->new({
|
|||
my $dbh = $ca->{dbh};
|
||||
|
||||
for my $session (@ARGV) {
|
||||
print "deleting session $session\n";
|
||||
print "deleting instances of session $session\n";
|
||||
my $n_instances = $dbh->do("delete from instances where session=?", {}, $session);
|
||||
print "\t$n_instances instances deleted\n";
|
||||
$dbh->commit();
|
||||
}
|
||||
remove_orphaned_sessions();
|
||||
remove_orphaned_files();
|
||||
#remove_orphaned_files();
|
||||
remove_orphaned_versions();
|
||||
$dbh->disconnect();
|
||||
exit();
|
||||
|
||||
sub remove_orphaned_sessions {
|
||||
print "deleting orphaned sessions\n";
|
||||
my $sessions
|
||||
= $dbh->selectcol_arrayref(
|
||||
q{select s.id from instances i right outer join sessions s on i.session=s.id where i.id is null}
|
||||
|
@ -43,6 +45,7 @@ sub remove_orphaned_sessions {
|
|||
}
|
||||
|
||||
sub remove_orphaned_files {
|
||||
print "deleting orphaned files\n";
|
||||
my $files
|
||||
= $dbh->selectcol_arrayref(
|
||||
q{select f.id from instances i right outer join files f on i.file=f.id where i.id is null}
|
||||
|
@ -56,14 +59,47 @@ sub remove_orphaned_files {
|
|||
}
|
||||
|
||||
sub remove_orphaned_versions {
|
||||
my $versions
|
||||
= $dbh->selectcol_arrayref(
|
||||
q{select v.id from instances i right outer join versions2 v on i.version=v.id where i.id is null}
|
||||
);
|
||||
# This differs from the other two because mysql doesn't find a good plan for
|
||||
# the outer join: It does an index lookup on instances for every row of
|
||||
# versions2. For the other tables that's good because sessions and files are
|
||||
# much smaller than instances, but there is only about a factor of 10
|
||||
# between versions2 and instances, so reading both sequentially is much
|
||||
# better. Surprisingly, perl is also faster at eliminating duplicates than
|
||||
# mysql, so just doing two selects and doing all the work in perl is faster
|
||||
# than “select distinct … minus …” though not much.
|
||||
print "deleting orphaned versions\n";
|
||||
my $sth;
|
||||
|
||||
for my $version (@$versions) {
|
||||
$dbh->{'mysql_use_result'} = 1;
|
||||
my $versions = Bit::Vector::Judy->new;
|
||||
$sth = $dbh->prepare("select id from versions2");
|
||||
$sth->execute;
|
||||
my $i = 0;
|
||||
while (my $version = $sth->fetchrow_array) {
|
||||
if ($i % 1_000_000 == 0) {
|
||||
print "\t$i records from versions processed, ", $versions->count(0, -1), " versions found\n";
|
||||
}
|
||||
$versions->set($version);
|
||||
$i++;
|
||||
}
|
||||
|
||||
$sth = $dbh->prepare("select version from instances");
|
||||
$sth->execute;
|
||||
$i = 0;
|
||||
while (my $version = $sth->fetchrow_array) {
|
||||
if ($i % 1_000_000 == 0) {
|
||||
print "\t$i records from instances processed, ", $versions->count(0, -1), " versions left\n";
|
||||
}
|
||||
$versions->unset($version);
|
||||
$i++;
|
||||
}
|
||||
$dbh->{'mysql_use_result'} = 0;
|
||||
|
||||
for (my $version = $versions->first; $version; $version = $versions->next($version)) {
|
||||
$dbh->do(q{delete from versions2 where id=?}, {}, $version);
|
||||
print "\tversion $version deleted\n";
|
||||
}
|
||||
$dbh->commit();
|
||||
}
|
||||
|
||||
# vim: tw=132
|
||||
|
|
Loading…
Reference in New Issue