Read all features for component in one query
Instead of retrieving from the database only the features which actually occur in the message, retrieve all of them above a certain interestingness threshold (0.4, because that's the minimum I've observed so far) and then match them in-process. This seems to be a little faster, but not by much. May have to revisit if my database grows.
This commit is contained in:
parent
e6dab8395f
commit
f4983e2472
|
@ -45,29 +45,25 @@ def extract_features(msgid, verbose, used_evidence):
|
|||
msg_pk = csr.fetchone()["id"]
|
||||
evidence = []
|
||||
for t in components:
|
||||
prev = {""}
|
||||
length = 1
|
||||
while prev:
|
||||
#raise RuntimeError("Baustelle")
|
||||
q = "select * from features where type = %s and interesting > 0.4"
|
||||
csr.execute(q, (t,))
|
||||
db_features = {}
|
||||
maxlen = 0
|
||||
for f in csr:
|
||||
db_features[f["feature"]] = f
|
||||
if f["length"] > maxlen:
|
||||
maxlen = f["length"]
|
||||
for length in range(1, maxlen + 1):
|
||||
current = set()
|
||||
for c in components[t]:
|
||||
for o in range(0, len(c) - length + 1):
|
||||
f = c[o:o+length]
|
||||
fp = f[:-1]
|
||||
if fp in prev:
|
||||
current.add(f)
|
||||
|
||||
# We keep only those as "prev" values which already existed
|
||||
# in the database
|
||||
prev = set()
|
||||
if current:
|
||||
q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
|
||||
q += " order by interesting desc nulls last"
|
||||
csr.execute(q, (t, length, *current))
|
||||
for i, r in enumerate(csr):
|
||||
prev.add(r["feature"])
|
||||
if i < EVIDENCE_MAX and r["interesting"]:
|
||||
evidence.append(r)
|
||||
length += 1
|
||||
for f in current:
|
||||
if f in db_features:
|
||||
evidence.append(db_features[f])
|
||||
evidence = sorted(evidence, key=lambda x: -x["length"])
|
||||
evidence = sorted(evidence, key=lambda x: -x["interesting"])
|
||||
interesting_evidence = []
|
||||
|
@ -115,3 +111,5 @@ def main():
|
|||
print(p, "spam" if p > 0.5 else "ham")
|
||||
|
||||
main()
|
||||
|
||||
# vim: tw=79
|
||||
|
|
Loading…
Reference in New Issue