Implement basic idea

I start with tokens of length 1, and add longer tokens iff they extend a previously seen token by one character. Probability computation follows Paul Graham's "A Plan for Spam", except that I haven't implemented some of his tweaks (most importantly, I don't account for frequencies within a message like he does). While selecting tokens for judging a message, I ignore substrings of tokens that have been seen previously. This still results in the majority of tokens overlapping, which is probably not good.
2019-08-17 09:29:11 +02:00 · 2019-08-17 09:29:11 +02:00 · f3817c4355
commit f3817c4355
3 changed files with 265 additions and 0 deletions
--- a/89
+++ b/89
@ -0,0 +1,89 @@
 #!/usr/bin/python3
 import argparse
 import email.parser
 import email.policy
 import os
 import psycopg2 
 import psycopg2.extras as ppe
 # Maps a component type (a header name or a text subtype) to the list of
 # values collected for it.
 components = {}
 def add_component(t, c):
    """Append value *c* to the list stored under type *t*.

    The list is created the first time a given type is seen.
    """
    components.setdefault(t, []).append(c)
 def add_message(msg):
    """Collect the headers and text bodies of *msg* via add_component().

    Every header is recorded under its header name.  Multipart messages
    are walked recursively.  A non-multipart part whose main content type
    is "text" is decoded to a string and recorded under its content
    subtype; other leaf parts contribute only their headers.

    Args:
        msg: a parsed email message object.
    """
    for name, value in msg.items():
        add_component(name, value)
    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
        return
    if msg.get_content_maintype() != "text":
        return
    # get_content_charset() copes with RFC 2231 encoded parameters (for
    # which get_param() hands back a tuple) and returns the given default
    # when no usable charset is declared.
    charset = msg.get_content_charset("iso-8859-1")
    payload = msg.get_payload(decode=True)
    try:
        text = payload.decode(charset, errors='replace')
    except LookupError:
        # Unknown charset label: fall back to latin-1, which can decode
        # any byte sequence, instead of aborting the whole run.
        text = payload.decode("iso-8859-1", errors='replace')
    add_component(msg.get_content_subtype(), text)
 def extract_features(msgtype, msgid):
    """Store the message and its substring features in the "bayes" database.

    A row is inserted into ``messages`` and, for every component type in
    ``components``, features are grown one character at a time: a
    substring of length n is recorded in ``message_features`` only if its
    first n-1 characters are in the set carried over from the previous
    round.  That set starts as {""} (so every single character
    qualifies) and afterwards holds those features of the current length
    that were found in the ``features`` table.

    Args:
        msgtype: value stored in ``messages.type``.
        msgid: value stored in ``messages.message_id``.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        _store_features(db, msgtype, msgid)
    finally:
        # Release the connection even when a statement fails.
        db.close()
 def _store_features(db, msgtype, msgid):
    """Insert the message row and its feature rows using connection *db*."""
    csr = db.cursor(cursor_factory=ppe.DictCursor)
    csr.execute(
            """
            insert into messages(id, type, message_id)
            values(default, %s, %s)
            returning id
            """,
            (msgtype, msgid))
    msg_pk = csr.fetchone()["id"]
    for t in components:
        # Prefixes that may be extended in this round; "" admits every
        # single character.
        prev = {""}
        length = 1
        while prev:
            current = set()
            for c in components[t]:
                for o in range(0, len(c) - length + 1):
                    f = c[o:o+length]
                    if f[:-1] in prev:
                        current.add(f)
            # Record for this message
            for f in current:
                csr.execute(
                        "insert into message_features(message, type, length, feature) values(%s, %s, %s, %s)",
                        (msg_pk, t, length, f))
            db.commit()
            # We keep only those as "prev" values which already existed
            # in the database
            prev = set()
            if current:
                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
                csr.execute(q, (t, length, *current))
                prev = {r["feature"] for r in csr}
            length += 1
 def main():
    """Parse one message from a file or stdin and record its features.

    Command line: an optional file name (file descriptor 0 is read when
    it is omitted) and ``--spam`` / ``--ham`` to set the message type.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    ap.add_argument('--spam', action="store_const", const="spam", dest="type")
    ap.add_argument('--ham',  action="store_const", const="ham",  dest="type")
    args = ap.parse_args()
    parser = email.parser.BytesParser(policy=email.policy.default)
    # Close the input as soon as it has been parsed instead of leaking
    # the handle.
    with (open(args.file, "rb") if args.file else os.fdopen(0, "rb")) as fh:
        msg = parser.parse(fh)
    add_message(msg)
    # NOTE(review): args.type is None when neither --spam nor --ham is given.
    extract_features(args.type, msg["Message-Id"])
 if __name__ == "__main__":
    main()
--- a/81
+++ b/81
@ -0,0 +1,81 @@
 #!/usr/bin/python3
 import time
 import psycopg2 
 import psycopg2.extras as ppe
 # Rebuild the "features" table from the recorded messages: delete every
 # row, then re-insert per-feature statistics computed from
 # message_features joined with messages.  Nothing is committed until the
 # single db.commit() at the end.
 t0 = time.time()
 db = psycopg2.connect("dbname=bayes")
 csr = db.cursor(cursor_factory=ppe.DictCursor)
 # Step 1: discard all previously computed statistics.
 csr.execute(
        """
        delete from features
        """)
 t1 = time.time()
 print(t1 - t0, "deleted", csr.rowcount, "rows")
 # Step 2: recompute the statistics.  The CTEs build on each other:
 #   m  - number of messages of type 'spam' and of type 'ham'
 #   f  - per (type, length, feature): number of message_features rows
 #        belonging to spam messages and to ham messages
 #   f1 - those counts divided by the respective message totals
 #   p  - spam_prob = spam_ratio / (spam_ratio + ham_ratio), computed only
 #        when spam_count + ham_count > 4 (NULL otherwise)
 #   p1 - spam_prob clamped to the range [0.01, 0.99]
 #   p2 - interesting = abs(spam_prob - 0.5)
 # NOTE(review): f1 divides by the message totals from m; this looks like
 # it would fail when there are no spam or no ham messages yet - confirm.
 csr.execute(
        """
        insert into features(
            type, length, feature,
            spam_count, ham_count,
            spam_prob,
            interesting
        )
        with
            m as (
                select
                    count(*) filter(where type='spam') as spam_message_count,
                    count(*) filter(where type='ham') as ham_message_count
                from messages
            ),
            f as (
                select f.type, length, feature,
                    count(*) filter (where m.type = 'spam') as spam_count,
                    count(*) filter (where m.type = 'ham') as ham_count
                from message_features f join messages m on (f.message = m.id)
                group by f.type, length, feature
            ),
            f1 as (
                select
                    type, length, feature,
                    spam_count, ham_count,
                    spam_count::float8 / spam_message_count as spam_ratio,
                    ham_count::float8 / ham_message_count as ham_ratio
                from f, m
            ),
            p as (
                select
                    type, length, feature,
                    spam_count, ham_count,
                    case 
                        when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio)
                    end as spam_prob
                from f1
            ),
            p1 as (
                select
                    type, length, feature,
                    spam_count, ham_count,
                    case
                        when spam_prob < 0.01 then 0.01
                        when spam_prob > 0.99 then 0.99
                        else spam_prob
                    end as spam_prob
                from p
            ),
            p2 as (
                select
                    type, length, feature,
                    spam_count, ham_count,
                    spam_prob,
                    abs(spam_prob - 0.5) as interesting
                from p1
            )
        select * from p2
        order by interesting desc
        """)
 # The elapsed time is still measured from t0, so it includes the delete.
 t1 = time.time()
 print(t1 - t0, "inserted", csr.rowcount, "rows")
 # The only commit in the script: delete and insert take effect together.
 db.commit()
--- a/95
+++ b/95
@ -0,0 +1,95 @@
 #!/usr/bin/python3
 import argparse
 import email.parser
 import email.policy
 import os
 import psycopg2 
 import psycopg2.extras as ppe
 # Component type (a header name or a text subtype) -> list of the values
 # collected for that type.
 components = {}
 def add_component(t, c):
    """Record value *c* under type *t*, starting a new list if needed."""
    try:
        components[t].append(c)
    except KeyError:
        components[t] = [c]
 def add_message(msg):
    """Collect the headers and text bodies of *msg* via add_component().

    Every header is recorded under its header name.  Multipart messages
    are walked recursively.  A non-multipart part whose main content type
    is "text" is decoded to a string and recorded under its content
    subtype; other leaf parts contribute only their headers.

    Args:
        msg: a parsed email message object.
    """
    for name, value in msg.items():
        add_component(name, value)
    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
        return
    if msg.get_content_maintype() != "text":
        return
    # get_content_charset() copes with RFC 2231 encoded parameters (for
    # which get_param() hands back a tuple) and returns the given default
    # when no usable charset is declared.
    charset = msg.get_content_charset("iso-8859-1")
    payload = msg.get_payload(decode=True)
    try:
        text = payload.decode(charset, errors='replace')
    except LookupError:
        # Unknown charset label: fall back to latin-1, which can decode
        # any byte sequence, instead of aborting the whole run.
        text = payload.decode("iso-8859-1", errors='replace')
    add_component(msg.get_content_subtype(), text)
 def extract_features(max_evidence=15):
    """Estimate the probability that the collected message is spam.

    Features of the components in ``components`` are looked up in the
    ``features`` table of the "bayes" database.  The found rows are
    ordered by descending ``interesting`` value (longer features first
    among equals); a row is skipped when its feature is a substring of an
    already selected feature of the same type.  The ``spam_prob`` values
    of the selected rows are combined as p1 / (p1 + p2), where p1 is the
    product of the probabilities and p2 the product of their complements.
    Each selected row is printed on a line starting with "#".

    Args:
        max_evidence: upper bound both for the rows taken from a single
            query and for the number of rows finally combined.  The
            default of 15 is the value that used to be hard-coded.

    Returns:
        The combined probability; 0.5 when no evidence was selected.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        evidence = _collect_evidence(db, max_evidence)
    finally:
        # Release the connection even when a query fails.
        db.close()
    # One stable sort with a compound key gives the same order as sorting
    # by length first and by interest afterwards.
    evidence.sort(key=lambda x: (-x["interesting"], -x["length"]))
    interesting_evidence = []
    for e in evidence:
        if len(interesting_evidence) >= max_evidence:
            break
        covered = any(
                e["type"] == i["type"] and e["feature"] in i["feature"]
                for i in interesting_evidence)
        if not covered:
            interesting_evidence.append(e)
    p1 = 1
    p2 = 1
    for i in interesting_evidence:
        print("#", i["spam_prob"], i["type"], i["feature"])
        p1 *= i["spam_prob"]
        p2 *= 1 - i["spam_prob"]
    return p1 / (p1 + p2)
 def _collect_evidence(db, max_evidence):
    """Return the candidate feature rows found through connection *db*."""
    csr = db.cursor(cursor_factory=ppe.DictCursor)
    evidence = []
    for t in components:
        # Prefixes that may be extended in this round; "" admits every
        # single character.
        prev = {""}
        length = 1
        while prev:
            current = set()
            for c in components[t]:
                for o in range(0, len(c) - length + 1):
                    f = c[o:o+length]
                    if f[:-1] in prev:
                        current.add(f)
            # We keep only those as "prev" values which already existed
            # in the database
            prev = set()
            if current:
                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
                q += " order by interesting desc nulls last"
                csr.execute(q, (t, length, *current))
                for i, r in enumerate(csr):
                    prev.add(r["feature"])
                    # Rows arrive most interesting first; rows whose
                    # "interesting" value is NULL or 0 are never evidence.
                    if i < max_evidence and r["interesting"]:
                        evidence.append(r)
            length += 1
    return evidence
 def main():
    """Classify one message read from a file or stdin and print the verdict.

    Command line: an optional file name; file descriptor 0 is read when
    it is omitted.  The output line holds the computed probability
    followed by "spam" if it is above 0.5 and "ham" otherwise.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    args = ap.parse_args()
    parser = email.parser.BytesParser(policy=email.policy.default)
    # Close the input as soon as it has been parsed instead of leaking
    # the handle.
    with (open(args.file, "rb") if args.file else os.fdopen(0, "rb")) as fh:
        msg = parser.parse(fh)
    add_message(msg)
    p = extract_features()
    print(p, "spam" if p > 0.5 else "ham")
 if __name__ == "__main__":
    main()