Improve overlap avoidance (#1)
When a feature is used, the input string in which it was found is split on that feature, and only the resulting fragments are searched for subsequent features, so overlapping features are impossible.
This commit is contained in:
parent
e51294bca2
commit
d96d1fc96e
|
@ -73,17 +73,13 @@ def extract_features(msgid, verbose):
|
|||
interesting_evidence = []
|
||||
seen = []
|
||||
for e in evidence:
|
||||
for s in seen:
|
||||
if e["type"] == s["type"] and (e["feature"] in s["feature"] or s["feature"] in e["feature"]):
|
||||
break
|
||||
else:
|
||||
new_comp = []
|
||||
for c in components[e["type"]]:
|
||||
new_comp += c.split(e["feature"])
|
||||
if len(new_comp) > len(components[e["type"]]):
|
||||
# we found it somewhere
|
||||
interesting_evidence.append(e)
|
||||
seen.append(e)
|
||||
ln = len(e["feature"])
|
||||
if ln >= 6:
|
||||
seen.append({"type": e["type"], "feature": e["feature"][0 : ln // 3]})
|
||||
seen.append({"type": e["type"], "feature": e["feature"][ln // 3 : ln * 2 // 3]})
|
||||
seen.append({"type": e["type"], "feature": e["feature"][ln * 2 // 3 : ln]})
|
||||
components[e["type"]] = new_comp
|
||||
if len(interesting_evidence) >= EVIDENCE_MAX:
|
||||
break
|
||||
p1 = 1
|
||||
|
|
Loading…
Reference in New Issue