Avoid overlapping tokens
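For each used token, record the token itself and, if it is at least 6 characters long, its first, middle and last thirds; then exclude any later token of the same type that contains, or is contained in, any of those recorded strings.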
This commit is contained in:
parent
f3817c4355
commit
631f97abe5
|
@ -10,6 +10,8 @@ import psycopg2.extras as ppe
|
||||||
|
|
||||||
components = {}
|
components = {}
|
||||||
|
|
||||||
|
EVIDENCE_MAX = 20
|
||||||
|
|
||||||
def add_component(t, c):
|
def add_component(t, c):
|
||||||
if t not in components:
|
if t not in components:
|
||||||
components[t] = []
|
components[t] = []
|
||||||
|
@ -55,24 +57,31 @@ def extract_features():
|
||||||
csr.execute(q, (t, length, *current))
|
csr.execute(q, (t, length, *current))
|
||||||
for i, r in enumerate(csr):
|
for i, r in enumerate(csr):
|
||||||
prev.add(r["feature"])
|
prev.add(r["feature"])
|
||||||
if i < 15 and r["interesting"]:
|
if i < EVIDENCE_MAX and r["interesting"]:
|
||||||
evidence.append(r)
|
evidence.append(r)
|
||||||
length += 1
|
length += 1
|
||||||
evidence = sorted(evidence, key=lambda x: -x["length"])
|
evidence = sorted(evidence, key=lambda x: -x["length"])
|
||||||
evidence = sorted(evidence, key=lambda x: -x["interesting"])
|
evidence = sorted(evidence, key=lambda x: -x["interesting"])
|
||||||
interesting_evidence = []
|
interesting_evidence = []
|
||||||
|
seen = []
|
||||||
for e in evidence:
|
for e in evidence:
|
||||||
for i in interesting_evidence:
|
for s in seen:
|
||||||
if e["type"] == i["type"] and e["feature"] in i["feature"]:
|
if e["type"] == s["type"] and (e["feature"] in s["feature"] or s["feature"] in e["feature"]):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
interesting_evidence.append(e)
|
interesting_evidence.append(e)
|
||||||
if len(interesting_evidence) >= 15:
|
seen.append(e)
|
||||||
|
ln = len(e["feature"])
|
||||||
|
if ln >= 6:
|
||||||
|
seen.append({"type": e["type"], "feature": e["feature"][0 : ln // 3]})
|
||||||
|
seen.append({"type": e["type"], "feature": e["feature"][ln // 3 : ln * 2 // 3]})
|
||||||
|
seen.append({"type": e["type"], "feature": e["feature"][ln * 2 // 3 : ln]})
|
||||||
|
if len(interesting_evidence) >= EVIDENCE_MAX:
|
||||||
break
|
break
|
||||||
p1 = 1
|
p1 = 1
|
||||||
p2 = 1
|
p2 = 1
|
||||||
for i in interesting_evidence:
|
for i in interesting_evidence:
|
||||||
print("#", i["spam_prob"], i["type"], i["feature"])
|
print("#", i["spam_prob"], i["type"], i["length"], i["feature"], sep="\t")
|
||||||
p1 *= i["spam_prob"]
|
p1 *= i["spam_prob"]
|
||||||
p2 *= 1 - i["spam_prob"]
|
p2 *= 1 - i["spam_prob"]
|
||||||
p = p1 / (p1 + p2)
|
p = p1 / (p1 + p2)
|
||||||
|
|
Loading…
Reference in New Issue