90 lines
2.6 KiB
Plaintext
90 lines
2.6 KiB
Plaintext
|
#!/usr/bin/python3
|
||
|
|
||
|
import argparse
|
||
|
import email.parser
|
||
|
import email.policy
|
||
|
import os
|
||
|
|
||
|
import psycopg2
|
||
|
import psycopg2.extras as ppe
|
||
|
|
||
|
# Registry of message components collected so far, keyed by component type;
# each value is the list of strings recorded for that type, in insertion order.
components = {}


def add_component(t, c):
    """Record component *c* under type *t* in the module-level registry.

    Args:
        t: Component type, used as the key in ``components``.
        c: Component value, appended to the list stored for *t*.
    """
    # setdefault creates the list the first time a type is seen, so no
    # separate membership check is needed.
    components.setdefault(t, []).append(c)
|
||
|
|
||
|
def add_message(msg):
    """Collect the headers and text bodies of *msg* as components.

    Every header is recorded under its header name.  Multipart messages are
    walked recursively, so the headers of each sub-part are recorded too.
    For a non-multipart part whose main content type is ``text``, the decoded
    body is recorded under the part's content subtype.  Other leaf parts
    contribute only their headers.

    Args:
        msg: A parsed message object from the ``email`` package.
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
        return

    if msg.get_content_maintype() != "text":
        return

    fallback = "iso-8859-1"
    charset = msg.get_param("charset", fallback)
    raw = msg.get_payload(decode=True)
    try:
        text = raw.decode(charset, errors="replace")
    except (LookupError, TypeError, ValueError):
        # The charset comes from the (untrusted) message: it may name a codec
        # Python does not know, be malformed, or be an RFC 2231 tuple rather
        # than a string.  Fall back to a codec that can decode any byte
        # sequence instead of aborting the whole import.
        text = raw.decode(fallback, errors="replace")
    add_component(msg.get_content_subtype(), text)
|
||
|
|
||
|
def _candidate_features(texts, length, prev):
    """Return all substrings of *length* characters whose prefix is in *prev*.

    The prefix of a substring is the substring without its last character, so
    for ``length == 1`` the prefix is the empty string.
    """
    return {
        text[start:start + length]
        for text in texts
        for start in range(len(text) - length + 1)
        if text[start:start + length - 1] in prev
    }


def extract_features(msgtype, msgid):
    """Store the message and its substring features in the ``bayes`` database.

    A row is inserted into ``messages``.  Then, for every component type in
    the module-level ``components`` registry, substrings of increasing length
    are inserted into ``message_features``.  A substring of length n+1 is only
    considered if its length-n prefix was found in the ``features`` table, so
    the search for a component type stops as soon as no candidate of the
    current length is known there.

    Args:
        msgtype: Value stored in ``messages.type``.
        msgid: Value stored in ``messages.message_id``.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        csr.execute(
            """
            insert into messages(id, type, message_id)
            values(default, %s, %s)
            returning id
            """,
            (msgtype, msgid))
        msg_pk = csr.fetchone()["id"]

        for t in components:
            # The empty string is the prefix of every length-1 substring, so
            # seeding with it admits all single characters in the first round.
            prev = {""}
            length = 1
            while prev:
                current = _candidate_features(components[t], length, prev)

                # Record for this message
                for f in current:
                    csr.execute(
                        "insert into message_features(message, type, length, feature) values(%s, %s, %s, %s)",
                        (msg_pk, t, length, f))
                db.commit()

                # We keep only those as "prev" values which already existed
                # in the database
                prev = set()
                if current:
                    # One placeholder per candidate keeps the query fully
                    # parameterized despite the variable-length IN list.
                    q = "select * from features where type = %s and length = %s and feature in (" + ", ".join(("%s",) * len(current)) + ")"
                    csr.execute(q, (t, length, *current))
                    for r in csr:
                        prev.add(r["feature"])
                length += 1
    finally:
        # Always release the connection (and its cursors), including when a
        # statement fails part-way through.
        db.close()
|
||
|
|
||
|
|
||
|
def main():
    """Parse one e-mail message and store its features in the database.

    The message is read from the file named on the command line, or from
    standard input when no file is given.  ``--spam`` and ``--ham`` set the
    message type passed to ``extract_features``; if neither is given the type
    is ``None``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    ap.add_argument('--spam', action="store_const", const="spam", dest="type")
    ap.add_argument('--ham', action="store_const", const="ham", dest="type")
    args = ap.parse_args()

    if args.file:
        source = open(args.file, "rb")
    else:
        # Wrap stdin without taking ownership of descriptor 0, so leaving the
        # ``with`` block below does not free fd 0 for reuse by later opens.
        source = os.fdopen(0, "rb", closefd=False)

    parser = email.parser.BytesParser(policy=email.policy.default)
    # Close the input as soon as parsing is done instead of leaking the handle.
    with source as fh:
        msg = parser.parse(fh)

    add_message(msg)
    extract_features(args.type, msg["Message-Id"])
|
||
|
|
||
|
# Run the importer only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|