#!/usr/bin/python3
"""Record the substring features of one e-mail message in PostgreSQL.

The message is read from a file (or standard input), split into components
(every header value plus the decoded body of every ``text/*`` part), and the
substrings ("features") of those components are written to the
``message_features`` table of the ``bayes`` database.

Features are grown one character at a time: a substring of length ``n`` is
only considered when its prefix of length ``n - 1`` is already present in the
``features`` table.  That table is only read by this script.
"""

import argparse
import email.parser
import email.policy
import os

import psycopg2
import psycopg2.extras as ppe

# Maps a component type (a header name, or the MIME subtype of a text part)
# to the list of values collected for that type by add_message().
components = {}

# Charset used for text parts that declare none, or declare an unusable one.
_DEFAULT_CHARSET = "iso-8859-1"


def add_component(t, c):
    """Append value *c* to the list of components of type *t*."""
    components.setdefault(t, []).append(c)


def add_message(msg):
    """Collect the components of *msg* into the module-level ``components``.

    Every header is stored under its header name.  Multipart messages are
    walked recursively; each non-multipart ``text/*`` part is decoded and
    stored under its MIME subtype.  Non-text leaf parts contribute only
    their headers.
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
    elif msg.get_content_maintype() == "text":
        charset = msg.get_param("charset", _DEFAULT_CHARSET)
        raw = msg.get_payload(decode=True)
        try:
            text = raw.decode(charset, errors="replace")
        except (LookupError, TypeError):
            # The declared charset is unknown to Python (LookupError) or is
            # not a plain string (TypeError).  iso-8859-1 maps every byte,
            # so the fallback cannot fail.
            text = raw.decode(_DEFAULT_CHARSET, errors="replace")
        add_component(msg.get_content_subtype(), text)


def _candidate_features(values, length, prev):
    """Return all substrings of *length* in *values* whose prefix is in *prev*.

    The prefix of a substring is the substring without its last character,
    so for ``length == 1`` the prefix is the empty string.
    """
    found = set()
    for c in values:
        for o in range(len(c) - length + 1):
            f = c[o:o + length]
            if f[:-1] in prev:
                found.add(f)
    return found


def _known_features(csr, t, length, candidates):
    """Return the subset of *candidates* present in the ``features`` table.

    Only rows matching component type *t* and *length* are considered.
    """
    if not candidates:
        return set()
    # Only the placeholder list is built dynamically; every value is still
    # passed to the driver as a bound parameter.
    placeholders = ", ".join(["%s"] * len(candidates))
    csr.execute(
        "select feature from features"
        " where type = %s and length = %s"
        " and feature in (" + placeholders + ")",
        (t, length, *candidates))
    return {row["feature"] for row in csr}


def extract_features(msgtype, msgid):
    """Store the message and the features of its collected components.

    Inserts a row into ``messages`` with type *msgtype* and message id
    *msgid*, then, for every component type in ``components``, inserts the
    candidate features of increasing length into ``message_features``.
    Growing stops for a component type as soon as none of the current
    candidates is found in the ``features`` table.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        csr.execute(
            """
            insert into messages(id, type, message_id)
            values(default, %s, %s)
            returning id
            """,
            (msgtype, msgid))
        msg_pk = csr.fetchone()["id"]

        for t, values in components.items():
            prev = {""}
            length = 1
            while prev:
                current = _candidate_features(values, length, prev)

                # Record for this message
                csr.executemany(
                    "insert into message_features(message, type, length, feature) values(%s, %s, %s, %s)",
                    [(msg_pk, t, length, f) for f in current])
                db.commit()

                # We keep only those as "prev" values which already existed
                # in the database
                prev = _known_features(csr, t, length, current)
                length += 1

        # Makes sure the message row is committed even when there were no
        # components at all (the loop above then never commits).
        db.commit()
    finally:
        db.close()


def main():
    """Parse the command line, read the message and store its features.

    The message is read from the optional ``file`` argument, or from
    standard input when no file is given.  ``--spam`` / ``--ham`` set the
    message type; when neither is given the type is ``None``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    ap.add_argument('--spam', action="store_const", const="spam", dest="type")
    ap.add_argument('--ham', action="store_const", const="ham", dest="type")
    args = ap.parse_args()

    parser = email.parser.BytesParser(policy=email.policy.default)
    if args.file:
        fh = open(args.file, "rb")
    else:
        # closefd=False: leave the process's standard input descriptor open
        # when the wrapper object is closed.
        fh = os.fdopen(0, "rb", closefd=False)
    with fh:
        msg = parser.parse(fh)

    add_message(msg)
    extract_features(args.type, msg["Message-Id"])


if __name__ == "__main__":
    main()