#!/usr/bin/python3
"""Score an e-mail message as spam or ham from substring features.

The message is read from a file given on the command line or from standard
input.  Every header value and every decoded ``text/*`` part is collected as
a "component".  Substrings of the components are looked up in the
``features`` table of the ``bayes`` PostgreSQL database, and the
``spam_prob`` values of the most interesting matches are combined into a
single score.  A score above 0.5 is reported as "spam", anything else as
"ham".
"""

import argparse
import email.parser
import email.policy
import os

import psycopg2
import psycopg2.extras as ppe

# Cap on the rows taken as evidence from a single query, and on the number
# of features that finally contribute to the score.
MAX_EVIDENCE = 15

# Charset used for text parts that declare none, or one Python cannot decode.
DEFAULT_CHARSET = "iso-8859-1"

# Maps a component type (a header name or a text subtype such as "plain")
# to the list of strings collected for that type.
components = {}


def add_component(t, c):
    """Record the string *c* under component type *t* in ``components``."""
    components.setdefault(t, []).append(c)


def add_message(msg):
    """Collect the headers and text bodies of *msg* into ``components``.

    Each header is stored under its header name.  Multipart messages are
    walked recursively.  A non-multipart part whose main content type is
    ``text`` is decoded and stored under its content subtype; other leaf
    parts contribute only their headers.
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
    elif msg.get_content_maintype() == "text":
        charset = msg.get_param("charset", DEFAULT_CHARSET)
        raw = msg.get_payload(decode=True)
        try:
            text = raw.decode(charset, errors="replace")
        except LookupError:
            # The message names a charset Python does not know.  Fall back
            # to the default instead of aborting the whole classification.
            text = raw.decode(DEFAULT_CHARSET, errors="replace")
        add_component(msg.get_content_subtype(), text)


def _collect_evidence(csr):
    """Return the interesting ``features`` rows matching the components.

    For each component type, substrings are tried in order of increasing
    length.  A substring of length n is only looked up when its first n - 1
    characters were found in the database in the previous round, so the
    search for a type stops once no candidate of the current length is known.
    """
    evidence = []
    for ctype, texts in components.items():
        # The empty string is the prefix of every one-character substring,
        # which seeds the first round.
        prev = {""}
        length = 1
        while prev:
            current = set()
            for text in texts:
                for start in range(len(text) - length + 1):
                    feature = text[start:start + length]
                    if feature[:-1] in prev:
                        current.add(feature)

            # We keep only those as "prev" values which already existed
            # in the database
            prev = set()
            if current:
                placeholders = ", ".join(("%s",) * len(current))
                q = (
                    "select * from features"
                    " where type = %s and length = %s"
                    f" and feature in ({placeholders})"
                    " order by interesting desc nulls last"
                )
                csr.execute(q, (ctype, length, *current))
                for i, row in enumerate(csr):
                    prev.add(row["feature"])
                    # Rows arrive most interesting first; only the leading
                    # ones with a truthy "interesting" value are evidence.
                    if i < MAX_EVIDENCE and row["interesting"]:
                        evidence.append(row)
            length += 1
    return evidence


def _select_interesting(evidence):
    """Return up to MAX_EVIDENCE rows, most interesting first.

    Ties in ``interesting`` are broken in favour of longer features.  A row
    is dropped when its feature is a substring of an already selected
    feature of the same type, since it adds no independent evidence.
    """
    ranked = sorted(
        evidence, key=lambda row: (-row["interesting"], -row["length"])
    )
    selected = []
    for candidate in ranked:
        redundant = any(
            candidate["type"] == chosen["type"]
            and candidate["feature"] in chosen["feature"]
            for chosen in selected
        )
        if redundant:
            continue
        selected.append(candidate)
        if len(selected) >= MAX_EVIDENCE:
            break
    return selected


def _combine_probabilities(probabilities):
    """Combine per-feature spam probabilities into one score.

    The score is prod(p) / (prod(p) + prod(1 - p)).  With no input it is
    0.5.  When both products are zero (for example one feature with
    probability 0 and another with probability 1) the evidence is
    contradictory, and 0.5 is returned instead of dividing by zero.
    """
    p_spam = 1
    p_ham = 1
    for prob in probabilities:
        p_spam *= prob
        p_ham *= 1 - prob
    total = p_spam + p_ham
    if total == 0:
        return 0.5
    return p_spam / total


def extract_features():
    """Look up the collected components and return the spam score.

    Connects to the ``bayes`` database, gathers evidence for everything in
    ``components``, prints one ``#``-prefixed line per feature used, and
    returns the combined score.  The cursor and the connection are closed
    before returning, also when a query fails.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        with db.cursor(cursor_factory=ppe.DictCursor) as csr:
            evidence = _collect_evidence(csr)
    finally:
        # "with connection" in psycopg2 only ends the transaction, so the
        # connection is closed explicitly.
        db.close()

    interesting_evidence = _select_interesting(evidence)
    for row in interesting_evidence:
        print("#", row["spam_prob"], row["type"], row["feature"])
    return _combine_probabilities(
        row["spam_prob"] for row in interesting_evidence
    )


def main():
    """Parse the command line, classify one message and print the verdict."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        'file', nargs='?',
        help="message file to classify (default: standard input)")
    args = ap.parse_args()

    fh = open(args.file, "rb") if args.file else os.fdopen(0, "rb")
    with fh:
        parser = email.parser.BytesParser(policy=email.policy.default)
        msg = parser.parse(fh)

    add_message(msg)
    p = extract_features()
    print(p, "spam" if p > 0.5 else "ham")


if __name__ == "__main__":
    main()