From 631f97abe5d09ac1ce46f54f4c86a4ccf039b617 Mon Sep 17 00:00:00 2001 From: "Peter J. Holzer" Date: Sat, 17 Aug 2019 11:12:34 +0200 Subject: [PATCH] Avoid overlapping tokens For each used token, record its first, middle and last third, and exclude all later tokens which include any of those. --- judge_message | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/judge_message b/judge_message index 281bf04..c52dae2 100755 --- a/judge_message +++ b/judge_message @@ -10,6 +10,8 @@ import psycopg2.extras as ppe components = {} +EVIDENCE_MAX = 20 + def add_component(t, c): if t not in components: components[t] = [] @@ -55,24 +57,31 @@ def extract_features(): csr.execute(q, (t, length, *current)) for i, r in enumerate(csr): prev.add(r["feature"]) - if i < 15 and r["interesting"]: + if i < EVIDENCE_MAX and r["interesting"]: evidence.append(r) length += 1 evidence = sorted(evidence, key=lambda x: -x["length"]) evidence = sorted(evidence, key=lambda x: -x["interesting"]) interesting_evidence = [] + seen = [] for e in evidence: - for i in interesting_evidence: - if e["type"] == i["type"] and e["feature"] in i["feature"]: + for s in seen: + if e["type"] == s["type"] and (e["feature"] in s["feature"] or s["feature"] in e["feature"]): break else: interesting_evidence.append(e) - if len(interesting_evidence) >= 15: + seen.append(e) + ln = len(e["feature"]) + if ln >= 6: + seen.append({"type": e["type"], "feature": e["feature"][0 : ln // 3]}) + seen.append({"type": e["type"], "feature": e["feature"][ln // 3 : ln * 2 // 3]}) + seen.append({"type": e["type"], "feature": e["feature"][ln * 2 // 3 : ln]}) + if len(interesting_evidence) >= EVIDENCE_MAX: break p1 = 1 p2 = 1 for i in interesting_evidence: - print("#", i["spam_prob"], i["type"], i["feature"]) + print("#", i["spam_prob"], i["type"], i["length"], i["feature"], sep="\t") p1 *= i["spam_prob"] p2 *= 1 - i["spam_prob"] p = p1 / (p1 + p2)