82 lines
2.5 KiB
Python
Executable File
82 lines
2.5 KiB
Python
Executable File
#!/usr/bin/python3
|
|
|
|
import time
|
|
|
|
|
|
import psycopg2
|
|
import psycopg2.extras as ppe
|
|
|
|
t0 = time.time()
|
|
db = psycopg2.connect("dbname=bayes")
|
|
csr = db.cursor(cursor_factory=ppe.DictCursor)
|
|
csr.execute(
|
|
"""
|
|
delete from features
|
|
""")
|
|
t1 = time.time()
|
|
print(t1 - t0, "deleted", csr.rowcount, "rows")
|
|
csr.execute(
|
|
"""
|
|
insert into features(
|
|
type, length, feature,
|
|
spam_count, ham_count,
|
|
spam_prob,
|
|
interesting
|
|
)
|
|
with
|
|
m as (
|
|
select
|
|
count(*) filter(where type='spam') as spam_message_count,
|
|
count(*) filter(where type='ham') as ham_message_count
|
|
from messages
|
|
),
|
|
f as (
|
|
select f.type, length, feature,
|
|
count(*) filter (where m.type = 'spam') as spam_count,
|
|
count(*) filter (where m.type = 'ham') as ham_count
|
|
from message_features f join messages m on (f.message = m.id)
|
|
group by f.type, length, feature
|
|
),
|
|
f1 as (
|
|
select
|
|
type, length, feature,
|
|
spam_count, ham_count,
|
|
spam_count::float8 / spam_message_count as spam_ratio,
|
|
ham_count::float8 / ham_message_count as ham_ratio
|
|
from f, m
|
|
),
|
|
p as (
|
|
select
|
|
type, length, feature,
|
|
spam_count, ham_count,
|
|
case
|
|
when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio)
|
|
end as spam_prob
|
|
from f1
|
|
),
|
|
p1 as (
|
|
select
|
|
type, length, feature,
|
|
spam_count, ham_count,
|
|
case
|
|
when spam_prob < 0.01 then 0.01
|
|
when spam_prob > 0.99 then 0.99
|
|
else spam_prob
|
|
end as spam_prob
|
|
from p
|
|
),
|
|
p2 as (
|
|
select
|
|
type, length, feature,
|
|
spam_count, ham_count,
|
|
spam_prob,
|
|
abs(spam_prob - 0.5) as interesting
|
|
from p1
|
|
)
|
|
select * from p2
|
|
order by interesting desc
|
|
""")
|
|
t1 = time.time()
|
|
print(t1 - t0, "inserted", csr.rowcount, "rows")
|
|
db.commit()
|