Improve overlap avoidance (#1)

When a feature is used, we split the input string in which it was found
on that feature and use the resulting fragments for all subsequent feature
searches, so overlaps are impossible.
This commit is contained in:
Peter J. Holzer 2019-09-14 11:01:24 +02:00
parent e51294bca2
commit d96d1fc96e
1 changed file with 6 additions and 10 deletions

View File

@ -73,17 +73,13 @@ def extract_features(msgid, verbose):
interesting_evidence = []
seen = []
for e in evidence:
for s in seen:
if e["type"] == s["type"] and (e["feature"] in s["feature"] or s["feature"] in e["feature"]):
break
else:
new_comp = []
for c in components[e["type"]]:
new_comp += c.split(e["feature"])
if len(new_comp) > len(components[e["type"]]):
# we found it somewhere
interesting_evidence.append(e)
seen.append(e)
ln = len(e["feature"])
if ln >= 6:
seen.append({"type": e["type"], "feature": e["feature"][0 : ln // 3]})
seen.append({"type": e["type"], "feature": e["feature"][ln // 3 : ln * 2 // 3]})
seen.append({"type": e["type"], "feature": e["feature"][ln * 2 // 3 : ln]})
components[e["type"]] = new_comp
if len(interesting_evidence) >= EVIDENCE_MAX:
break
p1 = 1