Implement basic idea
I start with tokens of length 1 and add longer tokens iff they extend a previously seen token by one character. The probability computation follows Paul Graham's "A Plan for Spam", except that I haven't implemented some of his tweaks (most importantly, I don't account for frequencies within a message like he does). While selecting tokens for judging a message, I ignore substrings of tokens that have been seen previously. This still results in the majority of tokens overlapping, which is probably not good.
This commit is contained in:
commit
f3817c4355
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import argparse
|
||||
import email.parser
|
||||
import email.policy
|
||||
import os
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras as ppe
|
||||
|
||||
# Maps a component type (a header name or a text subtype such as "plain")
# to the list of strings collected for it from the message being processed.
components = {}


def add_component(t, c):
    """Record string `c` under component type `t` in the global `components`.

    Several values may be stored for the same type (e.g. repeated headers
    or several text parts); they are kept in insertion order.
    """
    components.setdefault(t, []).append(c)
|
||||
|
||||
def add_message(msg):
    """Collect all headers and text bodies of `msg` via add_component().

    Every header is stored under its header name.  Multipart messages are
    walked recursively; each non-multipart part whose main content type is
    "text" is decoded and stored under its subtype (e.g. "plain", "html").
    Non-text leaf parts contribute only their headers.
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
    elif msg.get_content_maintype() == "text":
        fallback = "iso-8859-1"
        # get_content_charset() copes with RFC 2231 encoded parameters and
        # returns the fallback when no charset is declared.
        charset = msg.get_content_charset(fallback)
        raw = msg.get_payload(decode=True)
        try:
            text = raw.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is unknown to Python.  Latin-1 can decode
            # any byte sequence, so the part still yields features instead
            # of aborting the whole message.
            text = raw.decode(fallback, errors="replace")
        add_component(msg.get_content_subtype(), text)
|
||||
|
||||
def _candidate_features(texts, known_prefixes, length):
    """Return the substrings of `texts` with `length` characters whose
    first `length - 1` characters are in `known_prefixes`."""
    found = set()
    for text in texts:
        for start in range(len(text) - length + 1):
            feature = text[start:start + length]
            if feature[:-1] in known_prefixes:
                found.add(feature)
    return found


def extract_features(msgtype, msgid):
    """Store the message and its features in the "bayes" database.

    A row is inserted into `messages` with type `msgtype` and message id
    `msgid`.  Then, for every component type in the global `components`,
    features (substrings) of growing length are inserted into
    `message_features`.  A feature of length n is only considered if its
    first n-1 characters form a feature that already exists in the
    `features` table (length-1 features are always considered), so the
    loop for a component type ends once no feature of the current length
    is known to the database.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        csr.execute(
            """
            insert into messages(id, type, message_id)
            values(default, %s, %s)
            returning id
            """,
            (msgtype, msgid))
        msg_pk = csr.fetchone()["id"]

        for t, texts in components.items():
            prev = {""}
            length = 1
            while prev:
                current = _candidate_features(texts, prev, length)

                # Record for this message
                csr.executemany(
                    "insert into message_features(message, type, length, feature) values(%s, %s, %s, %s)",
                    [(msg_pk, t, length, f) for f in current])
                db.commit()

                # We keep only those as "prev" values which already existed
                # in the database
                prev = set()
                if current:
                    placeholders = ", ".join(["%s"] * len(current))
                    q = ("select feature from features"
                         " where type = %s and length = %s"
                         " and feature in (" + placeholders + ")")
                    csr.execute(q, (t, length, *current))
                    for r in csr:
                        prev.add(r["feature"])
                length += 1

        # Make sure the messages row is stored even if no component
        # produced a single feature (nothing would have been committed).
        db.commit()
    finally:
        # Release the connection on success and on error alike; anything
        # not yet committed is discarded.
        db.close()
|
||||
|
||||
|
||||
def main():
    """Parse one e-mail message and store its features for training.

    The message is read from the file given on the command line, or from
    standard input when no file is given.  `--spam` / `--ham` select the
    message type that is stored with it.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    ap.add_argument('--spam', action="store_const", const="spam", dest="type")
    ap.add_argument('--ham', action="store_const", const="ham", dest="type")
    args = ap.parse_args()
    # NOTE(review): if neither --spam nor --ham is given, args.type is None
    # and the message is stored without a type - confirm that is intended.

    if args.file:
        fh = open(args.file, "rb")
    else:
        fh = os.fdopen(0, "rb")
    parser = email.parser.BytesParser(policy=email.policy.default)
    # Close the input as soon as the message has been parsed.
    with fh:
        msg = parser.parse(fh)
    add_message(msg)
    extract_features(args.type, msg["Message-Id"])


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras as ppe
|
||||
|
||||
# Rebuild the `features` table from scratch: delete all rows, then
# recompute per-feature spam/ham counts and spam probabilities from
# `messages` and `message_features`.  Both statements run in a single
# transaction that is committed at the very end.
t0 = time.time()
db = psycopg2.connect("dbname=bayes")
try:
    csr = db.cursor(cursor_factory=ppe.DictCursor)
    csr.execute(
        """
        delete from features
        """)
    t1 = time.time()
    print(t1 - t0, "deleted", csr.rowcount, "rows")

    # Steps of the query:
    #   m   total number of spam and ham messages
    #   f   per feature: number of spam / ham messages containing it
    #   f1  those counts as a fraction of all spam / ham messages
    #   p   spam probability, only for features seen more than 4 times
    #       (NULL otherwise)
    #   p1  probability clamped to the range 0.01 .. 0.99
    #   p2  "interesting" = distance of the probability from 0.5
    # nullif() turns a message count of 0 into NULL, so that a database
    # containing only spam or only ham yields NULL probabilities instead
    # of failing with a division by zero.
    csr.execute(
        """
        insert into features(
            type, length, feature,
            spam_count, ham_count,
            spam_prob,
            interesting
        )
        with
        m as (
            select
                count(*) filter(where type='spam') as spam_message_count,
                count(*) filter(where type='ham') as ham_message_count
            from messages
        ),
        f as (
            select f.type, length, feature,
                count(*) filter (where m.type = 'spam') as spam_count,
                count(*) filter (where m.type = 'ham') as ham_count
            from message_features f join messages m on (f.message = m.id)
            group by f.type, length, feature
        ),
        f1 as (
            select
                type, length, feature,
                spam_count, ham_count,
                spam_count::float8 / nullif(spam_message_count, 0) as spam_ratio,
                ham_count::float8 / nullif(ham_message_count, 0) as ham_ratio
            from f, m
        ),
        p as (
            select
                type, length, feature,
                spam_count, ham_count,
                case
                    when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio)
                end as spam_prob
            from f1
        ),
        p1 as (
            select
                type, length, feature,
                spam_count, ham_count,
                case
                    when spam_prob < 0.01 then 0.01
                    when spam_prob > 0.99 then 0.99
                    else spam_prob
                end as spam_prob
            from p
        ),
        p2 as (
            select
                type, length, feature,
                spam_count, ham_count,
                spam_prob,
                abs(spam_prob - 0.5) as interesting
            from p1
        )
        select * from p2
        order by interesting desc
        """)
    # Like the first timing, this is the time elapsed since the start.
    t1 = time.time()
    print(t1 - t0, "inserted", csr.rowcount, "rows")
    db.commit()
finally:
    # Close the connection even if a statement failed; the uncommitted
    # delete/insert is discarded in that case.
    db.close()
|
|
@ -0,0 +1,95 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import argparse
|
||||
import email.parser
|
||||
import email.policy
|
||||
import os
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras as ppe
|
||||
|
||||
# Maps a component type (a header name or a text subtype such as "plain")
# to the list of strings collected for it from the message being judged.
components = {}


def add_component(t, c):
    """Record string `c` under component type `t` in the global `components`.

    Several values may be stored for the same type (e.g. repeated headers
    or several text parts); they are kept in insertion order.
    """
    components.setdefault(t, []).append(c)
|
||||
|
||||
def add_message(msg):
    """Collect all headers and text bodies of `msg` via add_component().

    Every header is stored under its header name.  Multipart messages are
    walked recursively; each non-multipart part whose main content type is
    "text" is decoded and stored under its subtype (e.g. "plain", "html").
    Non-text leaf parts contribute only their headers.
    """
    for name, value in msg.items():
        add_component(name, value)

    if msg.is_multipart():
        for part in msg.get_payload():
            add_message(part)
    elif msg.get_content_maintype() == "text":
        fallback = "iso-8859-1"
        # get_content_charset() copes with RFC 2231 encoded parameters and
        # returns the fallback when no charset is declared.
        charset = msg.get_content_charset(fallback)
        raw = msg.get_payload(decode=True)
        try:
            text = raw.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is unknown to Python.  Latin-1 can decode
            # any byte sequence, so the part can still be judged instead of
            # aborting the whole message.
            text = raw.decode(fallback, errors="replace")
        add_component(msg.get_content_subtype(), text)
|
||||
|
||||
def _candidate_features(texts, known_prefixes, length):
    """Return the substrings of `texts` with `length` characters whose
    first `length - 1` characters are in `known_prefixes`."""
    found = set()
    for text in texts:
        for start in range(len(text) - length + 1):
            feature = text[start:start + length]
            if feature[:-1] in known_prefixes:
                found.add(feature)
    return found


def _select_evidence(candidates, limit):
    """Pick at most `limit` rows, most interesting first (longer features
    win ties), skipping any row whose feature is a substring of an already
    picked feature of the same type."""
    ranked = sorted(
        candidates, key=lambda row: (-row["interesting"], -row["length"]))
    chosen = []
    for cand in ranked:
        covered = any(
            cand["type"] == kept["type"] and cand["feature"] in kept["feature"]
            for kept in chosen)
        if covered:
            continue
        chosen.append(cand)
        if len(chosen) >= limit:
            break
    return chosen


def extract_features(max_evidence=15):
    """Return the combined spam probability of the message in `components`.

    For every component type, features (substrings) of growing length are
    looked up in the `features` table of the "bayes" database; a feature
    of length n is only looked up if its first n-1 characters form a known
    feature.  From each lookup, up to `max_evidence` of the most
    interesting rows are kept as candidates.  The final evidence (at most
    `max_evidence` rows, see _select_evidence) is printed, one "#" line
    per row, and combined as p1 / (p1 + p2), where p1 is the product of
    the spam probabilities and p2 the product of their complements.
    Without any evidence the result is 0.5.
    """
    db = psycopg2.connect("dbname=bayes")
    try:
        csr = db.cursor(cursor_factory=ppe.DictCursor)
        evidence = []
        for t, texts in components.items():
            prev = {""}
            length = 1
            while prev:
                current = _candidate_features(texts, prev, length)

                # We keep only those as "prev" values which already existed
                # in the database
                prev = set()
                if current:
                    placeholders = ", ".join(["%s"] * len(current))
                    q = ("select * from features"
                         " where type = %s and length = %s"
                         " and feature in (" + placeholders + ")")
                    q += " order by interesting desc nulls last"
                    csr.execute(q, (t, length, *current))
                    for rank, r in enumerate(csr):
                        prev.add(r["feature"])
                        # Rows come most interesting first; a NULL (or
                        # zero) "interesting" value carries no evidence.
                        if rank < max_evidence and r["interesting"]:
                            evidence.append(r)
                length += 1
    finally:
        # Only reads were done; release the connection in every case.
        db.close()

    p1 = 1
    p2 = 1
    for row in _select_evidence(evidence, max_evidence):
        print("#", row["spam_prob"], row["type"], row["feature"])
        p1 *= row["spam_prob"]
        p2 *= 1 - row["spam_prob"]
    p = p1 / (p1 + p2)
    return p
|
||||
|
||||
def main():
    """Classify one e-mail message as spam or ham.

    The message is read from the file given on the command line, or from
    standard input when no file is given.  The computed spam probability
    is printed, followed by "spam" if it is greater than 0.5, else "ham".
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='?')
    args = ap.parse_args()

    if args.file:
        fh = open(args.file, "rb")
    else:
        fh = os.fdopen(0, "rb")
    parser = email.parser.BytesParser(policy=email.policy.default)
    # Close the input as soon as the message has been parsed.
    with fh:
        msg = parser.parse(fh)
    add_message(msg)
    p = extract_features()
    print(p, "spam" if p > 0.5 else "ham")


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue