#!/usr/bin/python3
"""Score one e-mail message as spam or ham from substring features.

The message is read from a file (or standard input), split into text
components (header values and decoded text parts), and every substring of
those components that is already present in the ``features`` table of the
``bayes`` database is looked up.  The most interesting non-overlapping
features are recorded in ``used_evidence`` and their ``spam_prob`` values
are combined into one probability, which is stored on the new ``messages``
row and printed.
"""

import argparse
import email.parser
import email.policy
import os

# Text components of the message being processed.  Keys are header names
# or the MIME subtype of a text part; values are lists of strings.
components = {}

# Upper bound on the evidence rows kept per query and used overall.
EVIDENCE_MAX = 20

# Charset assumed when a text part declares none, or one Python cannot use.
FALLBACK_CHARSET = "iso-8859-1"


def add_component(t, c):
    """Append the text *c* to the list of components of type *t*."""
    components.setdefault(t, []).append(c)


def add_message(msg):
    """Record all headers and text bodies of *msg* in ``components``.

    Multipart messages are walked recursively.  Non-text leaf parts
    contribute only their headers.
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
        return

    if msg.get_content_maintype() != "text":
        return

    raw = msg.get_payload(decode=True)
    if raw is None:
        # Nothing to decode (a message object without any payload).
        return

    charset = msg.get_content_charset(FALLBACK_CHARSET)
    try:
        text = raw.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # The declared charset is unknown to Python or is not a text
        # codec; a bogus label must not abort the classification.
        text = raw.decode(FALLBACK_CHARSET, errors="replace")
    add_component(msg.get_content_subtype(), text)


def combine_probabilities(probs):
    """Combine per-feature spam probabilities into a single probability.

    Returns ``P / (P + Q)`` where ``P`` is the product of all values in
    *probs* and ``Q`` the product of their complements.  An empty *probs*
    yields 0.5.  When both products are zero (contradictory evidence such
    as a 0 and a 1, or underflow) the result is the neutral 0.5 instead of
    a division error.
    """
    p_spam = 1
    p_ham = 1
    for prob in probs:
        p_spam *= prob
        p_ham *= 1 - prob
    total = p_spam + p_ham
    if not total:
        return 0.5
    return p_spam / total


def _candidates(texts, length, prev):
    """Return the substrings of *texts* with *length* characters whose
    prefix of ``length - 1`` characters is contained in *prev*."""
    found = set()
    for text in texts:
        for start in range(len(text) - length + 1):
            feature = text[start:start + length]
            if feature[:-1] in prev:
                found.add(feature)
    return found


def _collect_evidence(csr):
    """Look up every known feature of ``components`` using cursor *csr*.

    Features are grown one character at a time; a candidate of length
    ``n`` is only queried when its ``n - 1`` character prefix was found in
    the database in the previous round.  Returns the rows with a truthy
    ``interesting`` value among the first ``EVIDENCE_MAX`` rows of each
    query.
    """
    evidence = []
    for kind, texts in components.items():
        # The empty string is the prefix of every one-character feature.
        prev = {""}
        length = 1
        while prev:
            current = _candidates(texts, length, prev)
            # Only features that already exist in the database are kept
            # as prefixes for the next round.
            prev = set()
            if current:
                placeholders = ", ".join(["%s"] * len(current))
                query = (
                    "select * from features"
                    " where type = %s and length = %s"
                    " and feature in (" + placeholders + ")"
                    " order by interesting desc nulls last"
                )
                csr.execute(query, (kind, length, *current))
                for i, row in enumerate(csr):
                    prev.add(row["feature"])
                    if i < EVIDENCE_MAX and row["interesting"]:
                        evidence.append(row)
            length += 1
    return evidence


def _select_evidence(evidence):
    """Pick at most ``EVIDENCE_MAX`` mutually non-overlapping rows.

    Rows are visited by descending ``interesting`` and, for ties, by
    descending ``length``.  A row is skipped when its feature contains, or
    is contained in, a feature of the same type that was already taken.
    For taken features of six or more characters each third of the
    feature also blocks later rows.
    """
    ranked = sorted(
        evidence, key=lambda row: (-row["interesting"], -row["length"]))
    selected = []
    seen = []  # (type, feature) pairs that block later rows
    for row in ranked:
        kind = row["type"]
        feature = row["feature"]
        if any(kind == seen_kind
               and (feature in seen_feature or seen_feature in feature)
               for seen_kind, seen_feature in seen):
            continue
        selected.append(row)
        seen.append((kind, feature))
        size = len(feature)
        if size >= 6:
            first_cut = size // 3
            second_cut = size * 2 // 3
            seen.append((kind, feature[:first_cut]))
            seen.append((kind, feature[first_cut:second_cut]))
            seen.append((kind, feature[second_cut:]))
        if len(selected) >= EVIDENCE_MAX:
            break
    return selected


def extract_features(msgid):
    """Classify the message held in ``components`` and store the result.

    Inserts a row for *msgid* into ``messages``, records the evidence used
    in ``used_evidence`` (printing each item, tab separated, prefixed with
    ``#``), stores the combined probability in ``messages.type`` and
    commits.

    Returns the combined spam probability.
    """
    # Imported here so that the parsing helpers of this module can be
    # imported and used without a database driver being installed.
    import psycopg2
    import psycopg2.extras as ppe

    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        csr.execute(
            """
            insert into messages(id, message_id)
            values(default, %s)
            returning id
            """,
            (msgid,))
        msg_pk = csr.fetchone()["id"]

        interesting_evidence = _select_evidence(_collect_evidence(csr))
        for item in interesting_evidence:
            print("#", item["spam_prob"], item["type"], item["length"],
                  item["feature"], sep="\t")
            csr.execute(
                "insert into used_evidence(message, spam_prob, type, length, feature) values(%s, %s, %s, %s, %s)",
                (msg_pk, item["spam_prob"], item["type"], item["length"],
                 item["feature"]))

        p = combine_probabilities(
            item["spam_prob"] for item in interesting_evidence)
        csr.execute("update messages set type=%s where id = %s",
                    ("%.6f" % p, msg_pk))
        db.commit()
    finally:
        # Closing without a commit discards a half-written classification.
        db.close()
    return p


def main():
    """Parse the message named on the command line (or standard input),
    classify it and print the probability followed by ``spam`` or ``ham``."""
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    args = ap.parse_args()

    if args.file:
        fh = open(args.file, "rb")
    else:
        # Wrap standard input without taking ownership of descriptor 0.
        fh = os.fdopen(0, "rb", closefd=False)
    parser = email.parser.BytesParser(policy=email.policy.default)
    with fh:
        msg = parser.parse(fh)

    add_message(msg)
    p = extract_features(msg["Message-Id"])
    print(p, "spam" if p > 0.5 else "ham")


if __name__ == "__main__":
    main()