#!/usr/bin/python3 import time import psycopg2 import psycopg2.extras as ppe t0 = time.time() db = psycopg2.connect("dbname=bayes") csr = db.cursor(cursor_factory=ppe.DictCursor) csr.execute( """ delete from features """) t1 = time.time() print(t1 - t0, "deleted", csr.rowcount, "rows") csr.execute( """ insert into features( type, length, feature, spam_count, ham_count, spam_prob, interesting ) with m as ( select count(*) filter(where type='spam') as spam_message_count, count(*) filter(where type='ham') as ham_message_count from messages ), f as ( select f.type, length, feature, count(*) filter (where m.type = 'spam') as spam_count, count(*) filter (where m.type = 'ham') as ham_count from message_features f join messages m on (f.message = m.id) group by f.type, length, feature ), f1 as ( select type, length, feature, spam_count, ham_count, spam_count::float8 / spam_message_count as spam_ratio, ham_count::float8 / ham_message_count as ham_ratio from f, m ), p as ( select type, length, feature, spam_count, ham_count, case when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio) end as spam_prob from f1 ), p1 as ( select type, length, feature, spam_count, ham_count, case when spam_prob < 0.01 then 0.01 when spam_prob > 0.99 then 0.99 else spam_prob end as spam_prob from p ), p2 as ( select type, length, feature, spam_count, ham_count, spam_prob, abs(spam_prob - 0.5) as interesting from p1 ) select * from p2 order by interesting desc """) t1 = time.time() print(t1 - t0, "inserted", csr.rowcount, "rows") db.commit()