116 lines
3.5 KiB
Python
Executable File
116 lines
3.5 KiB
Python
Executable File
#!/usr/bin/python3
|
|
|
|
import argparse
|
|
import email.parser
|
|
import email.policy
|
|
import os
|
|
|
|
import psycopg2
|
|
import psycopg2.extras as ppe
|
|
|
|
# Maps a component type (a header name, or the content subtype of a text
# part) to the list of strings collected for it.  Filled by add_component();
# extract_features() later replaces the lists with the fragments left over
# after matched features have been split out.
components = {}
|
|
|
|
# Upper bound on the number of matched features that extract_features()
# combines into the final spam probability.
EVIDENCE_MAX = 20
|
|
|
|
def add_component(t, c):
    """Record the component string *c* under the type key *t*.

    The string is appended to the list kept for *t* in the module-level
    ``components`` dict; the list is created the first time *t* is seen.
    """
    components.setdefault(t, []).append(c)
|
|
|
|
def add_message(msg):
    """Collect the headers and text bodies of *msg* as components.

    Every header of *msg* is registered through ``add_component`` under its
    header name.  A multipart message is walked recursively, so the headers
    of each sub-part are registered too.  A non-multipart part whose main
    content type is ``text`` additionally has its transfer-decoded payload
    registered under its content subtype; parts of any other main type
    contribute only their headers.

    Args:
        msg: a parsed message object from the ``email`` package.
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
        return

    if msg.get_content_maintype() != "text":
        return

    fallback = "iso-8859-1"
    charset = msg.get_param("charset", fallback)
    raw = msg.get_payload(decode=True)
    try:
        text = raw.decode(charset, errors="replace")
    except LookupError:
        # The declared charset names no codec Python knows (mail from
        # arbitrary senders frequently carries bogus or empty charset
        # labels).  iso-8859-1 maps every byte to a character, so the body
        # still gets scored instead of aborting the whole classification.
        text = raw.decode(fallback, errors="replace")
    add_component(msg.get_content_subtype(), text)
|
|
|
|
def extract_features(msgid, verbose, used_evidence):
    """Score the collected components against the feature database.

    Works in three phases:

    1. For every component type in the module-level ``components`` dict,
       load the rows of the ``features`` table with that type and
       ``interesting > 0.4``, and keep each row whose ``feature`` string
       occurs as a substring (of length 1 up to the largest ``length``
       among the loaded rows) in one of the components of that type.
    2. Visit the matches ordered by descending ``interesting`` and, among
       equals, descending ``length``.  A match is used only if its feature
       still occurs in the remaining text of its type; the text is then
       split at the feature so that overlapping, lower-ranked features
       cannot match the same characters again.  At most ``EVIDENCE_MAX``
       matches are used.
    3. Combine the ``spam_prob`` values of the used matches as
       ``p1 / (p1 + p2)``, where ``p1`` is the product of the probabilities
       and ``p2`` the product of their complements.

    Side effects: inserts one row into ``messages`` and stores the result
    (formatted with six decimals) in its ``type`` column, optionally inserts
    one ``used_evidence`` row per used match, commits, and replaces entries
    of ``components`` with the split fragments.

    Args:
        msgid: value stored in ``messages.message_id``.
        verbose: if true, print one tab-separated ``#`` line per used match.
        used_evidence: if true, record each used match in the
            ``used_evidence`` table.

    Returns:
        The combined spam probability.  With no usable evidence this is 0.5;
        0.5 is also returned when both products are zero (for example one
        feature with probability 0 and another with probability 1), where
        the quotient would otherwise be undefined.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        csr.execute(
            """
            insert into messages(id, message_id)
            values(default, %s)
            returning id
            """,
            (msgid,))
        msg_pk = csr.fetchone()["id"]

        # Phase 1: find every known feature that occurs in the message.
        evidence = []
        for t in components:
            q = "select * from features where type = %s and interesting > 0.4"
            csr.execute(q, (t,))
            rows = csr.fetchall()
            db_features = {row["feature"]: row for row in rows}
            maxlen = max((row["length"] for row in rows), default=0)
            for length in range(1, maxlen + 1):
                # All distinct substrings of this length in the components
                # of type t; the set guarantees each feature is added once.
                current = {
                    c[o:o + length]
                    for c in components[t]
                    for o in range(len(c) - length + 1)
                }
                evidence.extend(
                    db_features[s] for s in current if s in db_features)

        # Phase 2: greedily pick the most interesting (then longest)
        # features that do not overlap an already picked one.
        evidence.sort(key=lambda row: (-row["interesting"], -row["length"]))
        interesting_evidence = []
        for e in evidence:
            remaining = components[e["type"]]
            new_comp = []
            for c in remaining:
                new_comp += c.split(e["feature"])
            if len(new_comp) > len(remaining):
                # Splitting produced extra fragments, so the feature was
                # still present somewhere in the remaining text.
                interesting_evidence.append(e)
                components[e["type"]] = new_comp
                if len(interesting_evidence) >= EVIDENCE_MAX:
                    break

        # Phase 3: combine the individual probabilities.
        p1 = 1
        p2 = 1
        for i in interesting_evidence:
            if verbose:
                print("#", i["spam_prob"], i["type"], i["length"], i["feature"], sep="\t")
            if used_evidence:
                csr.execute(
                    "insert into used_evidence(message, spam_prob, type, length, feature) values(%s, %s, %s, %s, %s)",
                    (msg_pk, i["spam_prob"], i["type"], i["length"], i["feature"]))
            p1 *= i["spam_prob"]
            p2 *= 1 - i["spam_prob"]

        # Both products can be zero (contradictory certainties or numeric
        # underflow); test the denominator explicitly rather than letting
        # the division raise, and report the neutral probability instead.
        total = p1 + p2
        p = p1 / total if total else 0.5

        csr.execute("update messages set type=%s where id = %s", ("%.6f" % p, msg_pk))
        db.commit()
        return p
    finally:
        # Always release the connection; on an error path this also
        # discards the uncommitted transaction.
        db.close()
|
|
|
|
def main():
    """Classify one e-mail message and print its spam probability.

    The message is read in binary mode from the optional positional ``file``
    argument, or from standard input when no file is given.  Its headers and
    text parts are collected with ``add_message`` and scored with
    ``extract_features``; the probability is printed followed by ``spam``
    (probability above 0.5) or ``ham``.

    Command-line options:
        --verbose            print every feature used for the score.
        --no-used-evidence   do not record the used features in the database.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--verbose', action='store_true')
    ap.add_argument('--no-used-evidence', action='store_false', dest='used_evidence')
    ap.add_argument('file', nargs='?')
    args = ap.parse_args()

    parser = email.parser.BytesParser(policy=email.policy.default)
    # File descriptor 0 is standard input, reopened here in binary mode.
    # The handle is closed as soon as the message has been parsed.
    with (open(args.file, "rb") if args.file else os.fdopen(0, "rb")) as fh:
        msg = parser.parse(fh)

    add_message(msg)
    p = extract_features(msg["Message-Id"], args.verbose, args.used_evidence)
    print(p, "spam" if p > 0.5 else "ham")
|
|
|
|
# Run the classifier only when executed as a script, so that importing this
# module does not read standard input or touch the database.
if __name__ == "__main__":
    main()
|
|
|
|
# vim: tw=79
|