From e6a4ba72f18beeb9fd048af9b8e8683f3919f930 Mon Sep 17 00:00:00 2001
From: "Peter J. Holzer" <hjp@hjp.at>
Date: Sat, 17 Aug 2019 11:32:59 +0200
Subject: [PATCH] Smooth limits of spam probability

Instead of clipping the probability at [0.01, 0.99] we just add 1 to
both the count and the message count of each ratio (spam and ham). With
my current corpus size this results in very similar limits (they will
creep closer to 0 and 1 with a larger corpus, but never reach them)
while avoiding having lots of tokens with exactly the same probability.
This makes the selection by judge_message less random and more relevant
(it prefers tokens which have been seen more frequently).

The minimum-count condition (spam_count + ham_count > 4) is dropped as
well, so spam_prob is no longer left NULL for rarely seen features.
---
 aggregate | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/aggregate b/aggregate
index a7f5f8c..fc2c3ab 100755
--- a/aggregate
+++ b/aggregate
@@ -41,37 +41,24 @@ csr.execute(
                 select
                     type, length, feature,
                     spam_count, ham_count,
-                    spam_count::float8 / spam_message_count as spam_ratio,
-                    ham_count::float8 / ham_message_count as ham_ratio
+                    (spam_count + 1.0) / (spam_message_count + 1.0) as spam_ratio,
+                    (ham_count + 1.0) / (ham_message_count + 1.0) as ham_ratio
                 from f, m
             ),
             p as (
                 select
                     type, length, feature,
                     spam_count, ham_count,
-                    case 
-                        when spam_count + ham_count > 4 then spam_ratio / (spam_ratio + ham_ratio)
-                    end as spam_prob
+                    spam_ratio / (spam_ratio + ham_ratio) as spam_prob
                 from f1
             ),
-            p1 as (
-                select
-                    type, length, feature,
-                    spam_count, ham_count,
-                    case
-                        when spam_prob < 0.01 then 0.01
-                        when spam_prob > 0.99 then 0.99
-                        else spam_prob
-                    end as spam_prob
-                from p
-            ),
             p2 as (
                 select
                     type, length, feature,
                     spam_count, ham_count,
                     spam_prob,
                     abs(spam_prob - 0.5) as interesting
-                from p1
+                from p
             )
         select * from p2
         order by interesting desc