Read all features for component in one query
Instead of retrieving from the database only the features which actually occur in the message, retrieve all of them above a certain interestingness threshold (0.4, because that's the minimum I've observed so far) and then match them in-process. This seems to be a little faster, but not by much. May have to revisit if my database grows.
This commit is contained in:
parent
e6dab8395f
commit
f4983e2472
|
@ -45,29 +45,25 @@ def extract_features(msgid, verbose, used_evidence):
|
||||||
msg_pk = csr.fetchone()["id"]
|
msg_pk = csr.fetchone()["id"]
|
||||||
evidence = []
|
evidence = []
|
||||||
for t in components:
|
for t in components:
|
||||||
prev = {""}
|
#raise RuntimeError("Baustelle")
|
||||||
length = 1
|
q = "select * from features where type = %s and interesting > 0.4"
|
||||||
while prev:
|
csr.execute(q, (t,))
|
||||||
|
db_features = {}
|
||||||
|
maxlen = 0
|
||||||
|
for f in csr:
|
||||||
|
db_features[f["feature"]] = f
|
||||||
|
if f["length"] > maxlen:
|
||||||
|
maxlen = f["length"]
|
||||||
|
for length in range(1, maxlen + 1):
|
||||||
current = set()
|
current = set()
|
||||||
for c in components[t]:
|
for c in components[t]:
|
||||||
for o in range(0, len(c) - length + 1):
|
for o in range(0, len(c) - length + 1):
|
||||||
f = c[o:o+length]
|
f = c[o:o+length]
|
||||||
fp = f[:-1]
|
|
||||||
if fp in prev:
|
|
||||||
current.add(f)
|
current.add(f)
|
||||||
|
|
||||||
# We keep only those as "prev" values which already existed
|
for f in current:
|
||||||
# in the database
|
if f in db_features:
|
||||||
prev = set()
|
evidence.append(db_features[f])
|
||||||
if current:
|
|
||||||
q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
|
|
||||||
q += " order by interesting desc nulls last"
|
|
||||||
csr.execute(q, (t, length, *current))
|
|
||||||
for i, r in enumerate(csr):
|
|
||||||
prev.add(r["feature"])
|
|
||||||
if i < EVIDENCE_MAX and r["interesting"]:
|
|
||||||
evidence.append(r)
|
|
||||||
length += 1
|
|
||||||
evidence = sorted(evidence, key=lambda x: -x["length"])
|
evidence = sorted(evidence, key=lambda x: -x["length"])
|
||||||
evidence = sorted(evidence, key=lambda x: -x["interesting"])
|
evidence = sorted(evidence, key=lambda x: -x["interesting"])
|
||||||
interesting_evidence = []
|
interesting_evidence = []
|
||||||
|
@ -115,3 +111,5 @@ def main():
|
||||||
print(p, "spam" if p > 0.5 else "ham")
|
print(p, "spam" if p > 0.5 else "ham")
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
# vim: tw=79
|
||||||
|
|
Loading…
Reference in New Issue