Improve overlap avoidance (#1)

When a feature is used, we use it to split the input string in which it was found and use the fragments for subsequent feature searches. So overlaps are impossible.
2019-09-14 11:01:24 +02:00 · 2019-09-14 11:01:24 +02:00 · d96d1fc96e
parent e51294bca2
commit d96d1fc96e
1 changed file with 6 additions and 10 deletions
--- a/16
+++ b/16
@@ -73,17 +73,13 @@ def extract_features(msgid, verbose):
    interesting_evidence = []
    seen = []
    for e in evidence:
-        for s in seen:
-            if e["type"] == s["type"] and (e["feature"] in s["feature"] or s["feature"] in e["feature"]):
-                break
-        else:
+        new_comp = []
+        for c in components[e["type"]]:
+            new_comp += c.split(e["feature"])
+        if len(new_comp) > len(components[e["type"]]):
+            # we found it somewhere
            interesting_evidence.append(e)
-            seen.append(e)
-            ln = len(e["feature"])
-            if ln >= 6:
-                seen.append({"type": e["type"], "feature": e["feature"][0           : ln // 3]})
-                seen.append({"type": e["type"], "feature": e["feature"][ln // 3     : ln * 2 // 3]})
-                seen.append({"type": e["type"], "feature": e["feature"][ln * 2 // 3 : ln]})
+            components[e["type"]] = new_comp
        if len(interesting_evidence) >= EVIDENCE_MAX:
            break
    p1 = 1