bayes/aggregate

82 lines
2.5 KiB
Plaintext
Raw Normal View History

#!/usr/bin/python3
import time
import psycopg2
import psycopg2.extras as ppe
t0 = time.time()
db = psycopg2.connect("dbname=bayes")
csr = db.cursor(cursor_factory=ppe.DictCursor)
csr.execute(
"""
delete from features
""")
t1 = time.time()
print(t1 - t0, "deleted", csr.rowcount, "rows")
csr.execute(
"""
insert into features(
type, length, feature,
spam_count, ham_count,
spam_prob,
interesting
)
with
m as (
select
count(*) filter(where type='spam') as spam_message_count,
count(*) filter(where type='ham') as ham_message_count
from messages
),
f as (
select f.type, length, feature,
count(*) filter (where m.type = 'spam') as spam_count,
count(*) filter (where m.type = 'ham') as ham_count
from message_features f join messages m on (f.message = m.id)
group by f.type, length, feature
),
f1 as (
select
type, length, feature,
spam_count, ham_count,
spam_count::float8 / spam_message_count as spam_ratio,
ham_count::float8 / ham_message_count as ham_ratio
from f, m
),
p as (
select
type, length, feature,
spam_count, ham_count,
case
when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio)
end as spam_prob
from f1
),
p1 as (
select
type, length, feature,
spam_count, ham_count,
case
when spam_prob < 0.01 then 0.01
when spam_prob > 0.99 then 0.99
else spam_prob
end as spam_prob
from p
),
p2 as (
select
type, length, feature,
spam_count, ham_count,
spam_prob,
abs(spam_prob - 0.5) as interesting
from p1
)
select * from p2
order by interesting desc
""")
t1 = time.time()
print(t1 - t0, "inserted", csr.rowcount, "rows")
db.commit()