Read all features for component in one query

Instead of retrieving from the database only features which actually
occur in the message, retrieve all of them above a certain
interestingness threshold (0.4, because that's the minimum I've observed
so far) and then match them in-process.

This seems to be a little faster but not by much. May have to revisit if
my database grows.
This commit is contained in:
Peter J. Holzer 2019-09-14 15:13:50 +02:00
parent e6dab8395f
commit f4983e2472
1 changed file with 16 additions and 18 deletions

View File

@ -45,29 +45,25 @@ def extract_features(msgid, verbose, used_evidence):
msg_pk = csr.fetchone()["id"]
evidence = []
for t in components:
prev = {""}
length = 1
while prev:
#raise RuntimeError("Baustelle")
q = "select * from features where type = %s and interesting > 0.4"
csr.execute(q, (t,))
db_features = {}
maxlen = 0
for f in csr:
db_features[f["feature"]] = f
if f["length"] > maxlen:
maxlen = f["length"]
for length in range(1, maxlen + 1):
current = set()
for c in components[t]:
for o in range(0, len(c) - length + 1):
f = c[o:o+length]
fp = f[:-1]
if fp in prev:
current.add(f)
current.add(f)
# We keep only those as "prev" values which already existed
# in the database
prev = set()
if current:
q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
q += " order by interesting desc nulls last"
csr.execute(q, (t, length, *current))
for i, r in enumerate(csr):
prev.add(r["feature"])
if i < EVIDENCE_MAX and r["interesting"]:
evidence.append(r)
length += 1
for f in current:
if f in db_features:
evidence.append(db_features[f])
evidence = sorted(evidence, key=lambda x: -x["length"])
evidence = sorted(evidence, key=lambda x: -x["interesting"])
interesting_evidence = []
@ -115,3 +111,5 @@ def main():
print(p, "spam" if p > 0.5 else "ham")
main()
# vim: tw=79