Improve overlap avoidance (#1)

When a feature is used, the input string in which it was found is split
on that feature, and only the resulting fragments are used for subsequent
feature searches. Overlaps are therefore impossible.
This commit is contained in:
Peter J. Holzer 2019-09-14 11:01:24 +02:00
parent e51294bca2
commit d96d1fc96e
1 changed file with 6 additions and 10 deletions

View File

@@ -73,17 +73,13 @@ def extract_features(msgid, verbose):
interesting_evidence = [] interesting_evidence = []
seen = [] seen = []
for e in evidence: for e in evidence:
for s in seen: new_comp = []
if e["type"] == s["type"] and (e["feature"] in s["feature"] or s["feature"] in e["feature"]): for c in components[e["type"]]:
break new_comp += c.split(e["feature"])
else: if len(new_comp) > len(components[e["type"]]):
# we found it somewhere
interesting_evidence.append(e) interesting_evidence.append(e)
seen.append(e) components[e["type"]] = new_comp
ln = len(e["feature"])
if ln >= 6:
seen.append({"type": e["type"], "feature": e["feature"][0 : ln // 3]})
seen.append({"type": e["type"], "feature": e["feature"][ln // 3 : ln * 2 // 3]})
seen.append({"type": e["type"], "feature": e["feature"][ln * 2 // 3 : ln]})
if len(interesting_evidence) >= EVIDENCE_MAX: if len(interesting_evidence) >= EVIDENCE_MAX:
break break
p1 = 1 p1 = 1