From f4983e24722679d28ba5c65c05b5822b66fd908a Mon Sep 17 00:00:00 2001 From: "Peter J. Holzer" Date: Sat, 14 Sep 2019 15:13:50 +0200 Subject: [PATCH] Read all features for component in one query Instead of retrieving from the database only features which actually occur in the message, retrieve all of them above a certain interestingness threshold (0.4, because that's the minimum I've observed so far) and then match them in-process. This seems to be a little faster but not by much. May have to revisit if my database grows. --- judge_message | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/judge_message b/judge_message index 9e5bc09..45167ed 100755 --- a/judge_message +++ b/judge_message @@ -45,29 +45,25 @@ def extract_features(msgid, verbose, used_evidence): msg_pk = csr.fetchone()["id"] evidence = [] for t in components: - prev = {""} - length = 1 - while prev: + #raise RuntimeError("Baustelle") + q = "select * from features where type = %s and interesting > 0.4" + csr.execute(q, (t,)) + db_features = {} + maxlen = 0 + for f in csr: + db_features[f["feature"]] = f + if f["length"] > maxlen: + maxlen = f["length"] + for length in range(1, maxlen + 1): current = set() for c in components[t]: for o in range(0, len(c) - length + 1): f = c[o:o+length] - fp = f[:-1] - if fp in prev: - current.add(f) + current.add(f) - # We keep only those as "prev" values which already existed - # in the database - prev = set() - if current: - q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")" - q += " order by interesting desc nulls last" - csr.execute(q, (t, length, *current)) - for i, r in enumerate(csr): - prev.add(r["feature"]) - if i < EVIDENCE_MAX and r["interesting"]: - evidence.append(r) - length += 1 + for f in current: + if f in db_features: + evidence.append(db_features[f]) evidence = sorted(evidence, key=lambda x: -x["length"]) evidence = 
sorted(evidence, key=lambda x: -x["interesting"]) interesting_evidence = [] @@ -115,3 +111,5 @@ def main(): print(p, "spam" if p > 0.5 else "ham") main() + +# vim: tw=79