From e6a4ba72f18beeb9fd048af9b8e8683f3919f930 Mon Sep 17 00:00:00 2001 From: "Peter J. Holzer" Date: Sat, 17 Aug 2019 11:32:59 +0200 Subject: [PATCH] Smooth limits of spam probability Instead of clipping the probability at [0.01, 0.99] we just add 1 to both the numerator and the denominator of the spam and ham ratios. With my current corpus size this results in very similar limits (they will creep closer to 0 and 1 with a larger corpus, but never reach them) while avoiding having lots of tokens with exactly the same probability. This makes the selection by judge_message less random and more relevant (it prefers tokens which have been seen more frequently). --- aggregate | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/aggregate b/aggregate index a7f5f8c..fc2c3ab 100755 --- a/aggregate +++ b/aggregate @@ -41,37 +41,24 @@ csr.execute( select type, length, feature, spam_count, ham_count, - spam_count::float8 / spam_message_count as spam_ratio, - ham_count::float8 / ham_message_count as ham_ratio + (spam_count + 1.0) / (spam_message_count + 1.0) as spam_ratio, + (ham_count + 1.0) / (ham_message_count + 1.0) as ham_ratio from f, m ), p as ( select type, length, feature, spam_count, ham_count, - case - when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio) - end as spam_prob + spam_ratio / (spam_ratio + ham_ratio) as spam_prob from f1 ), - p1 as ( - select - type, length, feature, - spam_count, ham_count, - case - when spam_prob < 0.01 then 0.01 - when spam_prob > 0.99 then 0.99 - else spam_prob - end as spam_prob - from p - ), p2 as ( select type, length, feature, spam_count, ham_count, spam_prob, abs(spam_prob - 0.5) as interesting - from p1 + from p ) select * from p2 order by interesting desc