Avoid overlapping tokens

For each used token, also record its first, middle and last thirds, and
exclude all tokens that contain any of them.
This commit is contained in:
Peter J. Holzer 2019-08-17 11:12:34 +02:00
parent f3817c4355
commit 631f97abe5
1 changed file with 14 additions and 5 deletions

View File

@ -10,6 +10,8 @@ import psycopg2.extras as ppe
components = {} components = {}
EVIDENCE_MAX = 20
def add_component(t, c): def add_component(t, c):
if t not in components: if t not in components:
components[t] = [] components[t] = []
@ -55,24 +57,31 @@ def extract_features():
csr.execute(q, (t, length, *current)) csr.execute(q, (t, length, *current))
for i, r in enumerate(csr): for i, r in enumerate(csr):
prev.add(r["feature"]) prev.add(r["feature"])
if i < 15 and r["interesting"]: if i < EVIDENCE_MAX and r["interesting"]:
evidence.append(r) evidence.append(r)
length += 1 length += 1
evidence = sorted(evidence, key=lambda x: -x["length"]) evidence = sorted(evidence, key=lambda x: -x["length"])
evidence = sorted(evidence, key=lambda x: -x["interesting"]) evidence = sorted(evidence, key=lambda x: -x["interesting"])
interesting_evidence = [] interesting_evidence = []
seen = []
for e in evidence: for e in evidence:
for i in interesting_evidence: for s in seen:
if e["type"] == i["type"] and e["feature"] in i["feature"]: if e["type"] == s["type"] and (e["feature"] in s["feature"] or s["feature"] in e["feature"]):
break break
else: else:
interesting_evidence.append(e) interesting_evidence.append(e)
if len(interesting_evidence) >= 15: seen.append(e)
ln = len(e["feature"])
if ln >= 6:
seen.append({"type": e["type"], "feature": e["feature"][0 : ln // 3]})
seen.append({"type": e["type"], "feature": e["feature"][ln // 3 : ln * 2 // 3]})
seen.append({"type": e["type"], "feature": e["feature"][ln * 2 // 3 : ln]})
if len(interesting_evidence) >= EVIDENCE_MAX:
break break
p1 = 1 p1 = 1
p2 = 1 p2 = 1
for i in interesting_evidence: for i in interesting_evidence:
print("#", i["spam_prob"], i["type"], i["feature"]) print("#", i["spam_prob"], i["type"], i["length"], i["feature"], sep="\t")
p1 *= i["spam_prob"] p1 *= i["spam_prob"]
p2 *= 1 - i["spam_prob"] p2 *= 1 - i["spam_prob"]
p = p1 / (p1 + p2) p = p1 / (p1 + p2)