bayes/judge_message

116 lines
3.6 KiB
Python
Executable File

#!/usr/bin/python3
import argparse
import email.parser
import email.policy
import os
import psycopg2
import psycopg2.extras as ppe
# Maps a component type (a header name, or the MIME subtype of a text part)
# to the list of strings collected for that type from the current message.
components = {}
# Upper bound on the evidence rows kept per feature query and on the number
# of features finally used to compute the spam probability.
EVIDENCE_MAX = 20
def add_component(t, c):
    """Record the string *c* under component type *t*.

    The value is appended to the list kept for *t* in the module-level
    ``components`` mapping; the list is created on first use.
    """
    components.setdefault(t, []).append(c)
def add_message(msg):
    """Collect the headers and text bodies of *msg* into ``components``.

    Every header of *msg* is registered under its header name.  Multipart
    messages are walked recursively.  For a leaf part whose main content
    type is ``text``, the decoded body is registered under the MIME subtype
    (e.g. ``plain`` or ``html``).  Other leaf parts contribute only their
    headers.

    Args:
        msg: An ``email.message`` object (the root message or a sub-part).
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
        return

    if msg.get_content_maintype() != "text":
        return

    fallback = "iso-8859-1"
    # get_content_charset() also copes with RFC 2231 encoded parameters and
    # returns the fallback when the parameter is missing.
    charset = msg.get_content_charset(fallback)
    raw = msg.get_payload(decode=True)
    try:
        text = raw.decode(charset, errors="replace")
    except LookupError:
        # The message declares a charset Python has no codec for; decode
        # with the fallback rather than aborting the whole run.
        text = raw.decode(fallback, errors="replace")
    add_component(msg.get_content_subtype(), text)
def extract_features(msgid, verbose):
    """Score the collected message components and record the verdict.

    Inserts a row into ``messages`` for *msgid*, then looks up which
    substrings of the strings in the module-level ``components`` exist in
    the ``features`` table.  Up to ``EVIDENCE_MAX`` of those features are
    chosen as evidence, stored in ``used_evidence``, and their ``spam_prob``
    values are combined into a single probability.  That probability is
    written to ``messages.type`` and returned.

    Side effects: rewrites entries of ``components`` (each chosen feature is
    cut out of the stored strings so later features cannot match the same
    text) and commits one transaction on the ``bayes`` database.

    Args:
        msgid: Value stored in ``messages.message_id``.
        verbose: If true, print each piece of used evidence to stdout.

    Returns:
        The combined spam probability.  This is 0.5 when no evidence was
        used or when the evidence cancels out completely.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        csr.execute(
            """
            insert into messages(id, message_id)
            values(default, %s)
            returning id
            """,
            (msgid,))
        msg_pk = csr.fetchone()["id"]

        evidence = []
        for t in components:
            # Grow candidate substrings one character at a time: a candidate
            # of the current length is only considered if its prefix (one
            # character shorter) was a known feature in the previous round.
            prev = {""}
            length = 1
            while prev:
                current = set()
                for c in components[t]:
                    for o in range(len(c) - length + 1):
                        f = c[o:o + length]
                        if f[:-1] in prev:
                            current.add(f)
                # We keep only those as "prev" values which already existed
                # in the database
                prev = set()
                if current:
                    placeholders = ", ".join(("%s",) * len(current))
                    q = (
                        "select * from features where type = %s and length = %s and feature in ("
                        + placeholders
                        + ")"
                        + " order by interesting desc nulls last"
                    )
                    csr.execute(q, (t, length, *current))
                    for i, r in enumerate(csr):
                        prev.add(r["feature"])
                        # Only the first EVIDENCE_MAX rows of each query can
                        # become evidence, and only if "interesting" is set.
                        if i < EVIDENCE_MAX and r["interesting"]:
                            evidence.append(r)
                length += 1

        # Most interesting first; among equals, longer features first.
        evidence.sort(key=lambda x: (-x["interesting"], -x["length"]))

        interesting_evidence = []
        for e in evidence:
            pieces = []
            for c in components[e["type"]]:
                pieces += c.split(e["feature"])
            if len(pieces) > len(components[e["type"]]):
                # we found it somewhere: keep it and remove the matched text
                # so that later features cannot reuse it
                interesting_evidence.append(e)
                components[e["type"]] = pieces
            if len(interesting_evidence) >= EVIDENCE_MAX:
                break

        p1 = 1
        p2 = 1
        for e in interesting_evidence:
            if verbose:
                print("#", e["spam_prob"], e["type"], e["length"], e["feature"], sep="\t")
            csr.execute(
                "insert into used_evidence(message, spam_prob, type, length, feature) values(%s, %s, %s, %s, %s)",
                (msg_pk, e["spam_prob"], e["type"], e["length"], e["feature"]))
            p1 *= e["spam_prob"]
            p2 *= 1 - e["spam_prob"]

        total = p1 + p2
        # Both products can reach zero (contradicting probabilities of 0 and
        # 1, or underflow); treat that as "undecided" instead of dividing by
        # zero.
        p = p1 / total if total else 0.5
        csr.execute("update messages set type=%s where id = %s", ("%.6f" % p, msg_pk))
        db.commit()
        return p
    finally:
        # Always release the connection; anything not committed is discarded.
        db.close()
def main():
    """Command-line entry point: judge one e-mail message.

    Parses the message from the file named by the optional positional
    argument, or from standard input (file descriptor 0) when none is given.
    The message is fed to ``add_message`` and ``extract_features``, and the
    resulting probability is printed followed by ``spam`` (above 0.5) or
    ``ham``.

    Command-line options:
        --verbose  Also print every piece of evidence that was used.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--verbose', action='store_true')
    ap.add_argument('file', nargs='?')
    args = ap.parse_args()

    if args.file:
        fh = open(args.file, "rb")
    else:
        # closefd=False: closing the wrapper must not close the process's
        # standard input descriptor.
        fh = os.fdopen(0, "rb", closefd=False)

    parser = email.parser.BytesParser(policy=email.policy.default)
    # Close the input as soon as parsing is done, even if parsing fails.
    with fh:
        msg = parser.parse(fh)

    add_message(msg)
    # msg["Message-Id"] is None when the header is missing.
    p = extract_features(msg["Message-Id"], args.verbose)
    print(p, "spam" if p > 0.5 else "ham")
# Run only when executed as a script, so importing this module has no side
# effects.
if __name__ == "__main__":
    main()