From f3817c4355bb06bba978519899794d88727b3e73 Mon Sep 17 00:00:00 2001
From: "Peter J. Holzer"
Date: Sat, 17 Aug 2019 09:29:11 +0200
Subject: [PATCH] Implement basic idea

I start with tokens of length 1, and add longer tokens iff they extend
a previously seen token by one character.

Probability computation follows Paul Graham's "A Plan for Spam", except
that I haven't implemented some of his tweaks (most importantly, I don't
account for frequencies within a message like he does).

While selecting tokens for judging a message, I ignore substrings of
tokens that have been seen previously. This still results in the
majority of tokens overlapping, which is probably not good.
---
 add_message   |  94 ++++++++++++++++++++++++++++++++++++++++++++++
 aggregate     |  84 +++++++++++++++++++++++++++++++++++++++
 judge_message | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 281 insertions(+)
 create mode 100755 add_message
 create mode 100755 aggregate
 create mode 100755 judge_message

diff --git a/add_message b/add_message
new file mode 100755
index 0000000..d5e3632
--- /dev/null
+++ b/add_message
@@ -0,0 +1,94 @@
+#!/usr/bin/python3
+
+import argparse
+import email.parser
+import email.policy
+import os
+
+import psycopg2
+import psycopg2.extras as ppe
+
+components = {}
+
+def add_component(t, c):
+    if t not in components:
+        components[t] = []
+    components[t].append(c)
+
+def add_message(msg):
+    for h in msg.items():
+        add_component(*h)
+
+    if msg.is_multipart():
+        for p in msg.get_payload():
+            add_message(p)
+    else:
+        if msg.get_content_maintype() == "text":
+            charset = msg.get_param("charset", "iso-8859-1")
+            add_component(
+                msg.get_content_subtype(),
+                msg.get_payload(decode=True)
+                .decode(charset, errors='replace'))
+
+def extract_features(msgtype, msgid):
+    db = psycopg2.connect("dbname=bayes")
+    csr = db.cursor(cursor_factory=ppe.DictCursor)
+    csr.execute(
+        """
+        insert into messages(id, type, message_id)
+        values(default, %s, %s)
+        returning id
+        """,
+        (msgtype, msgid))
+    msg_pk = csr.fetchone()["id"]
+    for t in components:
+        # Grow features one character at a time: a substring of length n
+        # is a candidate only if its length n-1 prefix was seen before.
+        prev = {""}
+        length = 1
+        while prev:
+            current = set()
+            for c in components[t]:
+                for o in range(0, len(c) - length + 1):
+                    f = c[o:o+length]
+                    fp = f[:-1]
+                    if fp in prev:
+                        current.add(f)
+
+            # Record for this message
+            for f in current:
+                csr.execute(
+                    "insert into message_features(message, type, length, feature) values(%s, %s, %s, %s)",
+                    (msg_pk, t, length, f))
+            db.commit()
+
+            # Keep as "prev" only those features which already existed
+            # in the database
+            prev = set()
+            if current:
+                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
+                csr.execute(q, (t, length, *current))
+                for r in csr:
+                    prev.add(r["feature"])
+            length += 1
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument('file', nargs='?')
+    # --spam or --ham must be given, otherwise messages.type would be null
+    g = ap.add_mutually_exclusive_group(required=True)
+    g.add_argument('--spam', action="store_const", const="spam", dest="type")
+    g.add_argument('--ham', action="store_const", const="ham", dest="type")
+    args = ap.parse_args()
+    if args.file:
+        fh = open(args.file, "rb")
+    else:
+        fh = os.fdopen(0, "rb")
+    parser = email.parser.BytesParser(policy=email.policy.default)
+    msg = parser.parse(fh)
+    add_message(msg)
+    extract_features(args.type, msg["Message-Id"])
+
+if __name__ == "__main__":
+    main()
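
Note: all three scripts connect to a PostgreSQL database "bayes" whose
schema is not part of this patch. A minimal schema consistent with the
queries in these scripts might look like the following sketch (table and
column names are taken from the queries; the exact types are an
assumption):

    -- messages recorded so far, labelled spam or ham
    create table messages (
        id         serial primary key,
        type       text,      -- 'spam' or 'ham'
        message_id text
    );

    -- every feature extracted from a recorded message
    create table message_features (
        message integer references messages(id),
        type    text,         -- header name or content subtype
        length  integer,
        feature text
    );

    -- aggregated statistics, rebuilt by the aggregate script
    create table features (
        type        text,
        length      integer,
        feature     text,
        spam_count  integer,
        ham_count   integer,
        spam_prob   float8,   -- null until seen more than 4 times
        interesting float8    -- abs(spam_prob - 0.5)
    );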
psycopg2.connect("dbname=bayes") +csr = db.cursor(cursor_factory=ppe.DictCursor) +csr.execute( + """ + delete from features + """) +t1 = time.time() +print(t1 - t0, "deleted", csr.rowcount, "rows") +csr.execute( + """ + insert into features( + type, length, feature, + spam_count, ham_count, + spam_prob, + interesting + ) + with + m as ( + select + count(*) filter(where type='spam') as spam_message_count, + count(*) filter(where type='ham') as ham_message_count + from messages + ), + f as ( + select f.type, length, feature, + count(*) filter (where m.type = 'spam') as spam_count, + count(*) filter (where m.type = 'ham') as ham_count + from message_features f join messages m on (f.message = m.id) + group by f.type, length, feature + ), + f1 as ( + select + type, length, feature, + spam_count, ham_count, + spam_count::float8 / spam_message_count as spam_ratio, + ham_count::float8 / ham_message_count as ham_ratio + from f, m + ), + p as ( + select + type, length, feature, + spam_count, ham_count, + case + when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio) + end as spam_prob + from f1 + ), + p1 as ( + select + type, length, feature, + spam_count, ham_count, + case + when spam_prob < 0.01 then 0.01 + when spam_prob > 0.99 then 0.99 + else spam_prob + end as spam_prob + from p + ), + p2 as ( + select + type, length, feature, + spam_count, ham_count, + spam_prob, + abs(spam_prob - 0.5) as interesting + from p1 + ) + select * from p2 + order by interesting desc + """) +t1 = time.time() +print(t1 - t0, "inserted", csr.rowcount, "rows") +db.commit() diff --git a/judge_message b/judge_message new file mode 100755 index 0000000..281bf04 --- /dev/null +++ b/judge_message @@ -0,0 +1,95 @@ +#!/usr/bin/python3 + +import argparse +import email.parser +import email.policy +import os + +import psycopg2 +import psycopg2.extras as ppe + +components = {} + +def add_component(t, c): + if t not in components: + components[t] = [] + components[t].append(c) + +def add_message(msg): + for h in msg.items(): + add_component(*h) + + if msg.is_multipart(): + for p in msg.get_payload(): + add_message(p) + else: + if msg.get_content_maintype() == "text": + charset = msg.get_param("charset", "iso-8859-1") + add_component( + msg.get_content_subtype(), + msg.get_payload(decode=True) + .decode(charset, errors='replace')) + +def extract_features(): + db = psycopg2.connect("dbname=bayes") + csr = db.cursor(cursor_factory=ppe.DictCursor) + evidence = [] + for t in components: + prev = {""} + length = 1 + while prev: + current = set() + for c in components[t]: + for o in range(0, len(c) - length + 1): + f = c[o:o+length] + fp = f[:-1] + if fp in prev: + current.add(f) + + # We keep only those as "prev" values which already existed + # in the database + prev = set() + if current: + q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")" + q += " order by interesting desc nulls last" + csr.execute(q, (t, length, *current)) + for i, r in enumerate(csr): + prev.add(r["feature"]) + if i < 15 and r["interesting"]: + evidence.append(r) + length += 1 + evidence = sorted(evidence, key=lambda x: -x["length"]) + evidence = sorted(evidence, key=lambda x: -x["interesting"]) + interesting_evidence = [] + for e in evidence: + for i in interesting_evidence: + if e["type"] == i["type"] and e["feature"] in i["feature"]: + break + else: + interesting_evidence.append(e) + if len(interesting_evidence) >= 15: + break + p1 = 1 + p2 = 1 + for i in 
diff --git a/judge_message b/judge_message
new file mode 100755
index 0000000..281bf04
--- /dev/null
+++ b/judge_message
@@ -0,0 +1,103 @@
+#!/usr/bin/python3
+
+import argparse
+import email.parser
+import email.policy
+import os
+
+import psycopg2
+import psycopg2.extras as ppe
+
+components = {}
+
+def add_component(t, c):
+    if t not in components:
+        components[t] = []
+    components[t].append(c)
+
+def add_message(msg):
+    for h in msg.items():
+        add_component(*h)
+
+    if msg.is_multipart():
+        for p in msg.get_payload():
+            add_message(p)
+    else:
+        if msg.get_content_maintype() == "text":
+            charset = msg.get_param("charset", "iso-8859-1")
+            add_component(
+                msg.get_content_subtype(),
+                msg.get_payload(decode=True)
+                .decode(charset, errors='replace'))
+
+def extract_features():
+    db = psycopg2.connect("dbname=bayes")
+    csr = db.cursor(cursor_factory=ppe.DictCursor)
+    evidence = []
+    for t in components:
+        # Grow features one character at a time: a substring of length n
+        # is a candidate only if its length n-1 prefix was seen before.
+        prev = {""}
+        length = 1
+        while prev:
+            current = set()
+            for c in components[t]:
+                for o in range(0, len(c) - length + 1):
+                    f = c[o:o+length]
+                    fp = f[:-1]
+                    if fp in prev:
+                        current.add(f)
+
+            # Keep as "prev" only those features which already existed
+            # in the database
+            prev = set()
+            if current:
+                q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
+                q += " order by interesting desc nulls last"
+                csr.execute(q, (t, length, *current))
+                for i, r in enumerate(csr):
+                    prev.add(r["feature"])
+                    # collect up to 15 features per batch, skipping those
+                    # without a spam probability yet
+                    if i < 15 and r["interesting"]:
+                        evidence.append(r)
+            length += 1
+    # Two stable sorts: interestingness is the primary key, length breaks ties
+    evidence = sorted(evidence, key=lambda x: -x["length"])
+    evidence = sorted(evidence, key=lambda x: -x["interesting"])
+    interesting_evidence = []
+    for e in evidence:
+        # Skip features which are substrings of already accepted evidence
+        for i in interesting_evidence:
+            if e["type"] == i["type"] and e["feature"] in i["feature"]:
+                break
+        else:
+            interesting_evidence.append(e)
+        if len(interesting_evidence) >= 15:
+            break
+    # Combine the evidence as in Graham's "A Plan for Spam"
+    p1 = 1
+    p2 = 1
+    for i in interesting_evidence:
+        print("#", i["spam_prob"], i["type"], i["feature"])
+        p1 *= i["spam_prob"]
+        p2 *= 1 - i["spam_prob"]
+    p = p1 / (p1 + p2)
+    return p
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument('file', nargs='?')
+    args = ap.parse_args()
+    if args.file:
+        fh = open(args.file, "rb")
+    else:
+        fh = os.fdopen(0, "rb")
+    parser = email.parser.BytesParser(policy=email.policy.default)
+    msg = parser.parse(fh)
+    add_message(msg)
+    p = extract_features()
+    print(p, "spam" if p > 0.5 else "ham")
+
+if __name__ == "__main__":
+    main()
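
Note: the final step in judge_message combines the selected evidence
with the formula from "A Plan for Spam". As a standalone sketch with
invented probabilities:

    def combine(probs):
        p1 = p2 = 1.0
        for p in probs:
            p1 *= p        # product of the spam probabilities
            p2 *= 1 - p    # product of their complements
        return p1 / (p1 + p2)

    # Two spammy features outweigh one hammy one:
    # combine([0.99, 0.9, 0.2]) ≈ 0.996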