Reimplemented the outer join in Perl with two separate queries and a Judy bitmap.

This is about 20 times faster (1 hour instead of 20 hours for 1.3E9 instances and 125E6 versions).
2013-12-01 12:10:35 +00:00 · 2013-12-01 12:10:35 +00:00 · 6982561a6d
parent 71e2f16454
commit 6982561a6d
1 changed files with 43 additions and 7 deletions
--- a/scripts/remove_session
+++ b/scripts/remove_session
@ -8,6 +8,7 @@ use warnings;
 use strict;

 use Simba::CA;
+use Bit::Vector::Judy;

 $| = 1;

@ -18,18 +19,19 @@ my $ca = Simba::CA->new({
 my $dbh = $ca->{dbh};

 for my $session (@ARGV) {
-    print "deleting session $session\n";
+    print "deleting instances of session $session\n";
    my $n_instances = $dbh->do("delete from instances where session=?", {}, $session);
    print "\t$n_instances instances deleted\n";
    $dbh->commit();
 }
 remove_orphaned_sessions();
-remove_orphaned_files();
+#remove_orphaned_files();
 remove_orphaned_versions();
 $dbh->disconnect();
 exit();

 sub remove_orphaned_sessions {
+    print "deleting orphaned sessions\n";
    my $sessions
 	= $dbh->selectcol_arrayref(
 	    q{select s.id from instances i right outer join sessions s  on i.session=s.id where i.id is null}
@ -43,6 +45,7 @@ sub remove_orphaned_sessions {
 }

 sub remove_orphaned_files {
+    print "deleting orphaned files\n";
    my $files
 	= $dbh->selectcol_arrayref(
 	    q{select f.id from instances i right outer join files f  on i.file=f.id where i.id is null}
@ -56,14 +59,47 @@ sub remove_orphaned_files {
 }

 sub remove_orphaned_versions {
-    my $versions
-	= $dbh->selectcol_arrayref(
-	    q{select v.id from instances i right outer join versions2 v  on i.version=v.id where i.id is null}
-	  );
+    # Delete every row of versions2 whose id is not referenced by the version
+    # column of any row in instances, then commit.
+    #
+    # This differs from the other two because mysql doesn't find a good plan for
+    # the outer join: It does an index lookup on instances for every row of
+    # versions2. For the other tables that's good because sessions and files are
+    # much smaller than instances, but there is only about a factor of 10
+    # between versions2 and instances, so reading both sequentially is much
+    # better. Surprisingly, perl is also faster at eliminating duplicates than
+    # mysql, so just doing two selects and doing all the work in perl is faster
+    # than “select distinct … minus …” though not much.
+    print "deleting orphaned versions\n";
+    my $sth;

-    for my $version (@$versions) {
+    # Both tables are huge: ask DBD::mysql to hand rows over as they arrive
+    # instead of buffering the complete result set in the client first.
+    $dbh->{'mysql_use_result'} = 1;
+    my $versions = Bit::Vector::Judy->new;
+
+    # Pass 1: set one bit for every id found in versions2.
+    $sth = $dbh->prepare("select id from versions2");
+    $sth->execute;
+    my $i = 0;
+    # The list assignment is true as long as a row was returned. A plain scalar
+    # assignment would also end the loop at the first NULL or 0 value.
+    while (my ($version) = $sth->fetchrow_array) {
+	if ($i % 1_000_000 == 0) {
+	    print "\t$i records from versions processed, ", $versions->count(0, -1), " versions found\n";
+	}
+	$versions->set($version) if defined $version;
+	$i++;
+    }
+    # A fetch error ends the loop exactly like the end of the data does, so it
+    # has to be checked explicitly (this is a no-op if RaiseError is set).
+    die "reading versions2 failed: " . $sth->errstr if $sth->err;
+
+    # Pass 2: clear the bit of every version that is still used by an instance.
+    $sth = $dbh->prepare("select version from instances");
+    $sth->execute;
+    $i = 0;
+    while (my ($version) = $sth->fetchrow_array) {
+	if ($i % 1_000_000 == 0) {
+	    print "\t$i records from instances processed, ", $versions->count(0, -1), " versions left\n";
+	}
+	# A NULL version references nothing; it must not end the scan early.
+	$versions->unset($version) if defined $version;
+	$i++;
+    }
+    # Never delete anything based on an incomplete scan of instances: every
+    # version that was not seen here would be treated as orphaned below.
+    die "reading instances failed: " . $sth->errstr if $sth->err;
+    $dbh->{'mysql_use_result'} = 0;
+
+    # Whatever is still set in the bitmap exists in versions2 but is not
+    # referenced by any instance.
+    # NOTE(review): the loop ends at the first false value, so an id of 0 would
+    # never be deleted; this assumes first/next return a false value when no
+    # further bit is set - confirm against Bit::Vector::Judy.
+    for (my $version = $versions->first; $version; $version = $versions->next($version)) {
 	$dbh->do(q{delete from versions2 where id=?}, {}, $version);
 	print "\tversion $version deleted\n";
    }
    $dbh->commit();
 }
+
+# vim: tw=132