Read all features for component in one query

Instead of retrieving from the database only the features which actually occur in the message, retrieve all of them above a certain interestingness threshold (0.4, because that's the minimum I've observed so far) and then match them in-process. This seems to be a little faster, but not by much. May have to revisit if my database grows.
2019-09-14 15:13:50 +02:00 · 2019-09-14 15:13:50 +02:00 · f4983e2472
parent e6dab8395f
commit f4983e2472
1 changed files with 16 additions and 18 deletions
--- a/34
+++ b/34
@ -45,29 +45,25 @@ def extract_features(msgid, verbose, used_evidence):
    msg_pk = csr.fetchone()["id"]
    evidence = []
    for t in components:
-        prev = {""}
-        length = 1
-        while prev:
+        #raise RuntimeError("Baustelle")
+        q = "select * from features where type = %s and interesting > 0.4"
+        csr.execute(q, (t,))
+        db_features = {}
+        maxlen = 0
+        for f in csr:
+            db_features[f["feature"]] = f
+            if f["length"] > maxlen:
+                maxlen = f["length"]
+        for length in range(1, maxlen + 1):
            current = set()
            for c in components[t]:
                for o in range(0, len(c) - length + 1):
                    f = c[o:o+length]
-                    fp = f[:-1]
-                    if fp in prev:
-                        current.add(f)
+                    current.add(f)

-            # We keep only those as "prev" values which already existed
-            # in the database
-            prev = set()
-            if current:
-                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
-                q += " order by interesting desc nulls last"
-                csr.execute(q, (t, length, *current))
-                for i, r in enumerate(csr):
-                    prev.add(r["feature"])
-                    if i < EVIDENCE_MAX and r["interesting"]:
-                        evidence.append(r)
-            length += 1
+            for f in current:
+                if f in db_features:
+                    evidence.append(db_features[f])
    evidence = sorted(evidence, key=lambda x: -x["length"])
    evidence = sorted(evidence, key=lambda x: -x["interesting"])
    interesting_evidence = []
@ -115,3 +111,5 @@ def main():
    print(p, "spam" if p > 0.5 else "ham")

 main()
+
+# vim: tw=79