Read all features for component in one query

Instead of retrieving from the database only the features which actually
occur in the message, retrieve all of them above a certain
interestingness threshold (0.4, because that's the minimum I've observed
so far) and then match them in-process.

This seems to be a little faster, but not by much. I may have to revisit
this if my database grows.
This commit is contained in:
Peter J. Holzer 2019-09-14 15:13:50 +02:00
parent e6dab8395f
commit f4983e2472
1 changed files with 16 additions and 18 deletions

View File

@ -45,29 +45,25 @@ def extract_features(msgid, verbose, used_evidence):
msg_pk = csr.fetchone()["id"] msg_pk = csr.fetchone()["id"]
evidence = [] evidence = []
for t in components: for t in components:
prev = {""} #raise RuntimeError("Baustelle")
length = 1 q = "select * from features where type = %s and interesting > 0.4"
while prev: csr.execute(q, (t,))
db_features = {}
maxlen = 0
for f in csr:
db_features[f["feature"]] = f
if f["length"] > maxlen:
maxlen = f["length"]
for length in range(1, maxlen + 1):
current = set() current = set()
for c in components[t]: for c in components[t]:
for o in range(0, len(c) - length + 1): for o in range(0, len(c) - length + 1):
f = c[o:o+length] f = c[o:o+length]
fp = f[:-1] current.add(f)
if fp in prev:
current.add(f)
# We keep only those as "prev" values which already existed for f in current:
# in the database if f in db_features:
prev = set() evidence.append(db_features[f])
if current:
q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
q += " order by interesting desc nulls last"
csr.execute(q, (t, length, *current))
for i, r in enumerate(csr):
prev.add(r["feature"])
if i < EVIDENCE_MAX and r["interesting"]:
evidence.append(r)
length += 1
evidence = sorted(evidence, key=lambda x: -x["length"]) evidence = sorted(evidence, key=lambda x: -x["length"])
evidence = sorted(evidence, key=lambda x: -x["interesting"]) evidence = sorted(evidence, key=lambda x: -x["interesting"])
interesting_evidence = [] interesting_evidence = []
@ -115,3 +111,5 @@ def main():
print(p, "spam" if p > 0.5 else "ham") print(p, "spam" if p > 0.5 else "ham")
main() main()
# vim: tw=79