69 lines
2.0 KiB
Python
Executable File
69 lines
2.0 KiB
Python
Executable File
#!/usr/bin/python3
|
|
|
|
import time
|
|
|
|
|
|
import psycopg2
|
|
import psycopg2.extras as ppe
|
|
|
|
# Rebuild the `features` table from scratch: delete every existing row, then
# repopulate it with per-feature spam/ham statistics derived from the
# `messages` and `message_features` tables. Both statements run in a single
# transaction, so the table is only replaced if the whole rebuild succeeds.

# Monotonic clock: the printed elapsed times cannot be skewed by wall-clock
# adjustments while the script runs.
t0 = time.monotonic()

db = psycopg2.connect("dbname=bayes")
try:
    # `with db` commits when the block finishes normally and rolls back if an
    # exception escapes; it does not close the connection, hence the
    # surrounding try/finally. The cursor is closed when the block exits.
    with db, db.cursor(cursor_factory=ppe.DictCursor) as csr:
        csr.execute(
            """
            delete from features
            """)
        t1 = time.monotonic()
        print(t1 - t0, "deleted", csr.rowcount, "rows")

        # Stages of the query below:
        #   m   - total number of spam and of ham messages.
        #   f   - for each (type, length, feature), the number of joined
        #         message_features rows belonging to spam / ham messages.
        #   f1  - per-class ratios with 1.0 added to both numerator and
        #         denominator, so neither ratio is zero and no division by
        #         zero can occur.
        #   p   - spam_prob = spam_ratio / (spam_ratio + ham_ratio).
        #   p2  - interesting = distance of spam_prob from 0.5.
        # The column order of `select * from p2` matches the insert column
        # list.
        csr.execute(
            """
            insert into features(
                type, length, feature,
                spam_count, ham_count,
                spam_prob,
                interesting
            )
            with
            m as (
                select
                    count(*) filter(where type='spam') as spam_message_count,
                    count(*) filter(where type='ham') as ham_message_count
                from messages
            ),
            f as (
                select f.type, length, feature,
                    count(*) filter (where m.type = 'spam') as spam_count,
                    count(*) filter (where m.type = 'ham') as ham_count
                from message_features f join messages m on (f.message = m.id)
                group by f.type, length, feature
            ),
            f1 as (
                select
                    type, length, feature,
                    spam_count, ham_count,
                    (spam_count + 1.0) / (spam_message_count + 1.0) as spam_ratio,
                    (ham_count + 1.0) / (ham_message_count + 1.0) as ham_ratio
                from f, m
            ),
            p as (
                select
                    type, length, feature,
                    spam_count, ham_count,
                    spam_ratio / (spam_ratio + ham_ratio) as spam_prob
                from f1
            ),
            p2 as (
                select
                    type, length, feature,
                    spam_count, ham_count,
                    spam_prob,
                    abs(spam_prob - 0.5) as interesting
                from p
            )
            select * from p2
            order by interesting desc
            """)
        # Elapsed time is cumulative since script start, not per statement.
        t1 = time.monotonic()
        print(t1 - t0, "inserted", csr.rowcount, "rows")
finally:
    # Release the server connection whether or not the rebuild succeeded.
    db.close()
|