bayes/judge_message

#!/usr/bin/python3

import argparse
import email.parser
import email.policy
import os

import psycopg2 
import psycopg2.extras as ppe

# Maps a component type (a header name or a text subtype, as passed in by
# add_message) to the list of strings collected for that type.
components = {}

def add_component(t, c):
    """Record the string c under the component type t.

    A new list is created the first time a type is seen; later values for
    the same type are appended in call order.
    """
    components.setdefault(t, []).append(c)

def add_message(msg):
    """Collect the headers and text bodies of msg as components.

    Every header is stored under its header name.  Multipart messages are
    walked recursively; for a non-multipart part whose main content type
    is "text", the decoded body is stored under its content subtype.
    Non-text leaf parts contribute only their headers.

    msg -- a parsed email message object (or sub-part of one)
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
    elif msg.get_content_maintype() == "text":
        charset = msg.get_param("charset", "iso-8859-1")
        raw = msg.get_payload(decode=True)
        try:
            text = raw.decode(charset, errors='replace')
        except LookupError:
            # The message declared a charset Python has no codec for.
            # Fall back to the same default used when no charset is
            # declared instead of aborting the whole run; iso-8859-1 can
            # decode any byte sequence.
            text = raw.decode("iso-8859-1", errors='replace')
        add_component(msg.get_content_subtype(), text)

def _collect_evidence(csr, limit):
    """Look up the features occurring in the collected components.

    For every component type, candidate substrings are grown one character
    at a time: a substring of length n is only looked up if its first
    n - 1 characters were returned by the database in the previous round.
    The search for a type stops once a round returns no known feature.

    csr   -- database cursor whose rows can be indexed by column name
    limit -- per query, only the first `limit` rows may become evidence

    Returns the list of rows that have a truthy "interesting" value.
    """
    evidence = []
    for t, texts in components.items():
        prev = {""}
        length = 1
        while prev:
            current = set()
            for c in texts:
                for o in range(len(c) - length + 1):
                    f = c[o:o + length]
                    if f[:-1] in prev:
                        current.add(f)

            # We keep only those as "prev" values which already existed
            # in the database
            prev = set()
            if current:
                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
                q += " order by interesting desc nulls last"
                csr.execute(q, (t, length, *current))
                for i, r in enumerate(csr):
                    prev.add(r["feature"])
                    if i < limit and r["interesting"]:
                        evidence.append(r)
            length += 1
    return evidence

def _select_evidence(evidence, limit):
    """Pick at most `limit` rows, most interesting first.

    Rows are ranked by descending "interesting", ties broken by descending
    "length".  A row is skipped if its feature is a substring of an
    already selected feature of the same type.
    """
    ranked = sorted(evidence, key=lambda r: (-r["interesting"], -r["length"]))
    selected = []
    for e in ranked:
        if any(e["type"] == s["type"] and e["feature"] in s["feature"]
               for s in selected):
            continue
        selected.append(e)
        if len(selected) >= limit:
            break
    return selected

def _combine_probabilities(probs):
    """Combine per-feature spam probabilities into a single probability.

    Computes prod(p) / (prod(p) + prod(1 - p)).  With no input this is 0.5.
    If both products are zero (the input contains both a 0 and a 1), the
    evidence is contradictory and 0.5 is returned instead of dividing by
    zero.
    """
    p1 = 1
    p2 = 1
    for sp in probs:
        p1 *= sp
        p2 *= 1 - sp
    total = p1 + p2
    if total == 0:
        return 0.5
    return p1 / total

def extract_features():
    """Judge the collected components against the feature database.

    Connects to the "bayes" database, gathers the known features found in
    the module-level components, selects up to 15 of the most interesting
    non-overlapping ones, prints each selected feature on a "#" line and
    returns the combined spam probability.
    """
    max_evidence = 15
    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        evidence = _collect_evidence(csr, max_evidence)
    finally:
        # Close explicitly so the connection is released even when a
        # query fails.
        db.close()

    interesting_evidence = _select_evidence(evidence, max_evidence)
    for i in interesting_evidence:
        print("#", i["spam_prob"], i["type"], i["feature"])
    return _combine_probabilities(e["spam_prob"] for e in interesting_evidence)

def main():
    """Parse one email message and print its spam verdict.

    The message is read from the file named on the command line, or from
    standard input when no file is given.  Prints the computed probability
    followed by "spam" if it is greater than 0.5, otherwise "ham".
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    args = ap.parse_args()
    if args.file:
        fh = open(args.file, "rb")
    else:
        # closefd=False: wrap stdin for binary reading without closing
        # file descriptor 0 when the wrapper is closed.
        fh = os.fdopen(0, "rb", closefd=False)
    # The message is parsed completely into memory, so the handle can be
    # released before the (slower) database work starts.
    with fh:
        parser = email.parser.BytesParser(policy=email.policy.default)
        msg = parser.parse(fh)
    add_message(msg)
    p = extract_features()
    print(p, "spam" if p > 0.5 else "ham")

if __name__ == "__main__":
    main()
Implement basic idea. I start with tokens of length 1, and add longer tokens iff they extend a previously seen token by one character. Probability computation follows Paul Graham's "A Plan for Spam", except that I haven't implemented some of his tweaks (most importantly, I don't account for frequencies within a message like he does). While selecting tokens for judging a message, I ignore substrings of tokens that have been seen previously. This still results in the majority of tokens overlapping, which is probably not good. 2019-08-17 09:29:11 +02:00			`#!/usr/bin/python3`

			`import argparse`
			`import email.parser`
			`import email.policy`
			`import os`

			`import psycopg2`
			`import psycopg2.extras as ppe`

			`components = {}`

			`def add_component(t, c):`
			`if t not in components:`
			`components[t] = []`
			`components[t].append(c)`

			`def add_message(msg):`
			`for h in msg.items():`
			`add_component(*h)`

			`if msg.is_multipart():`
			`for p in msg.get_payload():`
			`add_message(p)`
			`else:`
			`if msg.get_content_maintype() == "text":`
			`charset = msg.get_param("charset", "iso-8859-1")`
			`add_component(`
			`msg.get_content_subtype(),`
			`msg.get_payload(decode=True)`
			`.decode(charset, errors='replace'))`

			`def extract_features():`
			`db = psycopg2.connect("dbname=bayes")`
			`csr = db.cursor(cursor_factory=ppe.DictCursor)`
			`evidence = []`
			`for t in components:`
			`prev = {""}`
			`length = 1`
			`while prev:`
			`current = set()`
			`for c in components[t]:`
			`for o in range(0, len(c) - length + 1):`
			`f = c[o:o+length]`
			`fp = f[:-1]`
			`if fp in prev:`
			`current.add(f)`

			`# We keep only those as "prev" values which already existed`
			`# in the database`
			`prev = set()`
			`if current:`
			`q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"`
			`q += " order by interesting desc nulls last"`
			`csr.execute(q, (t, length, *current))`
			`for i, r in enumerate(csr):`
			`prev.add(r["feature"])`
			`if i < 15 and r["interesting"]:`
			`evidence.append(r)`
			`length += 1`
			`evidence = sorted(evidence, key=lambda x: -x["length"])`
			`evidence = sorted(evidence, key=lambda x: -x["interesting"])`
			`interesting_evidence = []`
			`for e in evidence:`
			`for i in interesting_evidence:`
			`if e["type"] == i["type"] and e["feature"] in i["feature"]:`
			`break`
			`else:`
			`interesting_evidence.append(e)`
			`if len(interesting_evidence) >= 15:`
			`break`
			`p1 = 1`
			`p2 = 1`
			`for i in interesting_evidence:`
			`print("#", i["spam_prob"], i["type"], i["feature"])`
			`p1 *= i["spam_prob"]`
			`p2 *= 1 - i["spam_prob"]`
			`p = p1 / (p1 + p2)`
			`return p`

			`def main():`
			`ap = argparse.ArgumentParser()`
			`ap.add_argument('file', nargs='?')`
			`args = ap.parse_args()`
			`if args.file:`
			`fh = open(args.file, "rb")`
			`else:`
			`fh = os.fdopen(0, "rb")`
			`parser = email.parser.BytesParser(policy=email.policy.default)`
			`msg = parser.parse(fh)`
			`add_message(msg)`
			`p = extract_features()`
			`print(p, "spam" if p > 0.5 else "ham")`

			`main()`