diff --git a/aggregate b/aggregate index a7f5f8c..fc2c3ab 100755 --- a/aggregate +++ b/aggregate @@ -41,37 +41,24 @@ csr.execute( select type, length, feature, spam_count, ham_count, - spam_count::float8 / spam_message_count as spam_ratio, - ham_count::float8 / ham_message_count as ham_ratio + (spam_count + 1.0) / (spam_message_count + 1.0) as spam_ratio, + (ham_count + 1.0) / (ham_message_count + 1.0) as ham_ratio from f, m ), p as ( select type, length, feature, spam_count, ham_count, - case - when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio) - end as spam_prob + spam_ratio / (spam_ratio + ham_ratio) as spam_prob from f1 ), - p1 as ( - select - type, length, feature, - spam_count, ham_count, - case - when spam_prob < 0.01 then 0.01 - when spam_prob > 0.99 then 0.99 - else spam_prob - end as spam_prob - from p - ), p2 as ( select type, length, feature, spam_count, ham_count, spam_prob, abs(spam_prob - 0.5) as interesting - from p1 + from p ) select * from p2 order by interesting desc