Read all features for component in one query

Instead of retrieving from the database only features which actually occur in the message, retrieve all of them above a certain interestingness threshold (0.4, because that's the minimum I've observed so far) and then match them in-process. This seems to be a little faster, but not by much. May have to revisit if my database grows.
2019-09-14 15:13:50 +02:00 · 2019-09-14 15:13:50 +02:00 · f4983e2472
parent e6dab8395f
commit f4983e2472
1 changed files with 16 additions and 18 deletions
--- a/32
+++ b/32
@ -45,29 +45,25 @@ def extract_features(msgid, verbose, used_evidence):
    msg_pk = csr.fetchone()["id"]
    evidence = []
    for t in components:
-        prev = {""}
+        #raise RuntimeError("Baustelle")
-        length = 1
+        q = "select * from features where type = %s and interesting > 0.4"
-        while prev:
+        csr.execute(q, (t,))
        db_features = {}
        maxlen = 0
        for f in csr:
            db_features[f["feature"]] = f
            if f["length"] > maxlen:
                maxlen = f["length"]
        for length in range(1, maxlen + 1):
            current = set()
            for c in components[t]:
                for o in range(0, len(c) - length + 1):
                    f = c[o:o+length]
                    fp = f[:-1]
                    if fp in prev:
                    current.add(f)
-            # We keep only those as "prev" values which already existed
+            for f in current:
-            # in the database
+                if f in db_features:
-            prev = set()
+                    evidence.append(db_features[f])
            if current:
                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
                q += " order by interesting desc nulls last"
                csr.execute(q, (t, length, *current))
                for i, r in enumerate(csr):
                    prev.add(r["feature"])
                    if i < EVIDENCE_MAX and r["interesting"]:
                        evidence.append(r)
            length += 1
    evidence = sorted(evidence, key=lambda x: -x["length"])
    evidence = sorted(evidence, key=lambda x: -x["interesting"])
    interesting_evidence = []
@ -115,3 +111,5 @@ def main():
    print(p, "spam" if p > 0.5 else "ham")
 main()
 # vim: tw=79