bayes/aggregate

69 lines
2.0 KiB
Plaintext
Raw Permalink Normal View History

#!/usr/bin/python3
import time
import psycopg2
import psycopg2.extras as ppe
# Rebuild the `features` table from scratch: delete every existing row, then
# repopulate it with per-feature spam/ham counts and a smoothed spam
# probability derived from `messages` and `message_features`.
# Both statements run in a single transaction, so a failure leaves the
# previous contents of `features` in place.

# Empties the target table before it is recomputed.
DELETE_FEATURES_SQL = """
delete from features
"""

# Recomputes every feature row.  The CTE stages are:
#   m  - total number of messages whose type is 'spam' and whose type is 'ham'
#   f  - per (feature type, length, feature): number of joined
#        message_features/messages rows belonging to spam and to ham messages
#   f1 - add-one smoothed ratios: (count + 1) / (message_count + 1); the
#        smoothing also keeps the denominator in `p` strictly positive
#   p  - spam_prob = spam_ratio / (spam_ratio + ham_ratio)
#   p2 - interesting = abs(spam_prob - 0.5), i.e. distance from "no signal"
INSERT_FEATURES_SQL = """
insert into features(
type, length, feature,
spam_count, ham_count,
spam_prob,
interesting
)
with
m as (
select
count(*) filter(where type='spam') as spam_message_count,
count(*) filter(where type='ham') as ham_message_count
from messages
),
f as (
select f.type, length, feature,
count(*) filter (where m.type = 'spam') as spam_count,
count(*) filter (where m.type = 'ham') as ham_count
from message_features f join messages m on (f.message = m.id)
group by f.type, length, feature
),
f1 as (
select
type, length, feature,
spam_count, ham_count,
(spam_count + 1.0) / (spam_message_count + 1.0) as spam_ratio,
(ham_count + 1.0) / (ham_message_count + 1.0) as ham_ratio
from f, m
),
p as (
select
type, length, feature,
spam_count, ham_count,
spam_ratio / (spam_ratio + ham_ratio) as spam_prob
from f1
),
p2 as (
select
type, length, feature,
spam_count, ham_count,
spam_prob,
abs(spam_prob - 0.5) as interesting
from p
)
select * from p2
order by interesting desc
"""

# perf_counter() is monotonic, so the reported elapsed times cannot go
# backwards or jump if the system clock is adjusted while the script runs.
t0 = time.perf_counter()
db = psycopg2.connect("dbname=bayes")
try:
    # `with db` commits when the block finishes normally and rolls back if it
    # raises (psycopg2 >= 2.5); it does NOT close the connection, hence the
    # surrounding try/finally.  The cursor context manager closes the cursor.
    with db, db.cursor(cursor_factory=ppe.DictCursor) as csr:
        csr.execute(DELETE_FEATURES_SQL)
        print(time.perf_counter() - t0, "deleted", csr.rowcount, "rows")

        csr.execute(INSERT_FEATURES_SQL)
        # Elapsed time is cumulative since start-up (includes connect+delete).
        print(time.perf_counter() - t0, "inserted", csr.rowcount, "rows")
finally:
    db.close()