Implement basic idea

I start with tokens of length 1, and add longer tokens iff they extend a previously seen token by one character. Probability computation follows Paul Graham's "A Plan for Spam", except that I haven't implemented some of his tweaks (most importantly, I don't account for frequencies within a message like he does). While selecting tokens for judging a message, I ignore substrings of tokens that have been seen previously. This still results in the majority of tokens overlapping, which is probably not good.
2019-08-17 09:29:11 +02:00 · 2019-08-17 09:29:11 +02:00 · f3817c4355
commit f3817c4355
3 changed files with 265 additions and 0 deletions
--- a/89
+++ b/89
@ -0,0 +1,89 @@
+#!/usr/bin/python3
+
+import argparse
+import email.parser
+import email.policy
+import os
+
+import psycopg2 
+import psycopg2.extras as ppe
+
+# Maps a component type (a header name or a text subtype) to the list of
+# string values collected for that type from the parsed message.
+components = {}
+
+def add_component(t, c):
+    if t not in components:
+        components[t] = []
+    components[t].append(c)
+
+def add_message(msg):
+    for h in msg.items():
+        add_component(*h)
+
+    if msg.is_multipart():
+        for p in msg.get_payload():
+            add_message(p)
+    else:
+        if msg.get_content_maintype() == "text":
+            charset = msg.get_param("charset", "iso-8859-1")
+            add_component(
+                    msg.get_content_subtype(),
+                    msg.get_payload(decode=True)
+                        .decode(charset, errors='replace'))
+
+def extract_features(msgtype, msgid):
+    db = psycopg2.connect("dbname=bayes")
+    csr = db.cursor(cursor_factory=ppe.DictCursor)
+    csr.execute(
+            """
+            insert into messages(id, type, message_id)
+            values(default, %s, %s)
+            returning id
+            """,
+            (msgtype, msgid))
+    msg_pk = csr.fetchone()["id"]
+    for t in components:
+        prev = {""}
+        length = 1
+        while prev:
+            current = set()
+            for c in components[t]:
+                for o in range(0, len(c) - length + 1):
+                    f = c[o:o+length]
+                    fp = f[:-1]
+                    if fp in prev:
+                        current.add(f)
+
+            # Record for this message
+            for f in current:
+                csr.execute(
+                        "insert into message_features(message, type, length, feature) values(%s, %s, %s, %s)",
+                        (msg_pk, t, length, f))
+            db.commit()
+
+            # We keep only those as "prev" values which already existed
+            # in the database
+            prev = set()
+            if current:
+                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
+                csr.execute(q, (t, length, *current))
+                for r in csr:
+                    prev.add(r["feature"])
+            length += 1
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument('file', nargs='?')
+    ap.add_argument('--spam', action="store_const", const="spam", dest="type")
+    ap.add_argument('--ham',  action="store_const", const="ham",  dest="type")
+    args = ap.parse_args()
+    if args.file:
+        fh = open(args.file, "rb")
+    else:
+        fh = os.fdopen(0, "rb")
+    parser = email.parser.BytesParser(policy=email.policy.default)
+    msg = parser.parse(fh)
+    add_message(msg)
+    extract_features(args.type, msg["Message-Id"])
+
+main()
--- a/81
+++ b/81
@ -0,0 +1,81 @@
+#!/usr/bin/python3
+
+import time
+
+
+import psycopg2 
+import psycopg2.extras as ppe
+
+t0 = time.time()
+db = psycopg2.connect("dbname=bayes")
+csr = db.cursor(cursor_factory=ppe.DictCursor)
+csr.execute(
+        """
+        delete from features
+        """)
+t1 = time.time()
+print(t1 - t0, "deleted", csr.rowcount, "rows")
+csr.execute(
+        """
+        insert into features(
+            type, length, feature,
+            spam_count, ham_count,
+            spam_prob,
+            interesting
+        )
+        with
+            m as (
+                select
+                    count(*) filter(where type='spam') as spam_message_count,
+                    count(*) filter(where type='ham') as ham_message_count
+                from messages
+            ),
+            f as (
+                select f.type, length, feature,
+                    count(*) filter (where m.type = 'spam') as spam_count,
+                    count(*) filter (where m.type = 'ham') as ham_count
+                from message_features f join messages m on (f.message = m.id)
+                group by f.type, length, feature
+            ),
+            f1 as (
+                select
+                    type, length, feature,
+                    spam_count, ham_count,
+                    spam_count::float8 / spam_message_count as spam_ratio,
+                    ham_count::float8 / ham_message_count as ham_ratio
+                from f, m
+            ),
+            p as (
+                select
+                    type, length, feature,
+                    spam_count, ham_count,
+                    case 
+                        when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio)
+                    end as spam_prob
+                from f1
+            ),
+            p1 as (
+                select
+                    type, length, feature,
+                    spam_count, ham_count,
+                    case
+                        when spam_prob < 0.01 then 0.01
+                        when spam_prob > 0.99 then 0.99
+                        else spam_prob
+                    end as spam_prob
+                from p
+            ),
+            p2 as (
+                select
+                    type, length, feature,
+                    spam_count, ham_count,
+                    spam_prob,
+                    abs(spam_prob - 0.5) as interesting
+                from p1
+            )
+        select * from p2
+        order by interesting desc
+        """)
+t1 = time.time()
+print(t1 - t0, "inserted", csr.rowcount, "rows")
+db.commit()
--- a/95
+++ b/95
@ -0,0 +1,95 @@
+#!/usr/bin/python3
+
+import argparse
+import email.parser
+import email.policy
+import os
+
+import psycopg2 
+import psycopg2.extras as ppe
+
+# Maps a component type (a header name or a text subtype) to the list of
+# string values collected for that type from the parsed message.
+components = {}
+
+def add_component(t, c):
+    if t not in components:
+        components[t] = []
+    components[t].append(c)
+
+def add_message(msg):
+    for h in msg.items():
+        add_component(*h)
+
+    if msg.is_multipart():
+        for p in msg.get_payload():
+            add_message(p)
+    else:
+        if msg.get_content_maintype() == "text":
+            charset = msg.get_param("charset", "iso-8859-1")
+            add_component(
+                    msg.get_content_subtype(),
+                    msg.get_payload(decode=True)
+                        .decode(charset, errors='replace'))
+
+def extract_features():
+    db = psycopg2.connect("dbname=bayes")
+    csr = db.cursor(cursor_factory=ppe.DictCursor)
+    evidence = []
+    for t in components:
+        prev = {""}
+        length = 1
+        while prev:
+            current = set()
+            for c in components[t]:
+                for o in range(0, len(c) - length + 1):
+                    f = c[o:o+length]
+                    fp = f[:-1]
+                    if fp in prev:
+                        current.add(f)
+
+            # We keep only those as "prev" values which already existed
+            # in the database
+            prev = set()
+            if current:
+                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
+                q += " order by interesting desc nulls last"
+                csr.execute(q, (t, length, *current))
+                for i, r in enumerate(csr):
+                    prev.add(r["feature"])
+                    if i < 15 and r["interesting"]:
+                        evidence.append(r)
+            length += 1
+    evidence = sorted(evidence, key=lambda x: -x["length"])
+    evidence = sorted(evidence, key=lambda x: -x["interesting"])
+    interesting_evidence = []
+    for e in evidence:
+        for i in interesting_evidence:
+            if e["type"] == i["type"] and e["feature"] in i["feature"]:
+                break
+        else:
+            interesting_evidence.append(e)
+        if len(interesting_evidence) >= 15:
+            break
+    p1 = 1
+    p2 = 1
+    for i in interesting_evidence:
+        print("#", i["spam_prob"], i["type"], i["feature"])
+        p1 *= i["spam_prob"]
+        p2 *= 1 - i["spam_prob"]
+    p = p1 / (p1 + p2)
+    return p
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument('file', nargs='?')
+    args = ap.parse_args()
+    if args.file:
+        fh = open(args.file, "rb")
+    else:
+        fh = os.fdopen(0, "rb")
+    parser = email.parser.BytesParser(policy=email.policy.default)
+    msg = parser.parse(fh)
+    add_message(msg)
+    p = extract_features()
+    print(p, "spam" if p > 0.5 else "ham")
+
+main()