Make expiry exponential

Choosing the element to expire from a uniform random distribution tends to expire data points much too early. We want to always keep the oldest observation and have a roughly exponential distribution between the newest and the oldest observation.
2022-12-17 17:49:19 +01:00 · 2022-12-17 17:49:19 +01:00 · b10b62e77d
parent a802f2ee27
commit b10b62e77d
1 changed files with 34 additions and 2 deletions
--- a/ltsdb_json.py
+++ b/ltsdb_json.py
@ -4,9 +4,13 @@ import fcntl
 import glob
 import hashlib
 import json
+import logging
+import math
 import random
 import time

+log = logging.getLogger()
+
 class LTS:
    base_dir = "data"
    queue_dir = "queue"
@ -39,10 +43,38 @@ class LTS:
                json.dump({"description": self.description, "data": self.data}, fh)
            self.rebuild_index()

+    def shrink(self):
+        # Remove one element so that the spacing of the remaining timestamps
+        # gets closer to an exponential curve anchored at the oldest and the
+        # newest points. dt is the mean spacing of the five newest points; k is
+        # chosen so that the ideal age at index 0 equals the real age of data[0].
+        # We pop the first point (index >= 1, so data[0] is always kept) that is
+        # older than its ideal age (I'm tempted to dub this the barber's algorithm).
+        # This is extremely inefficient but it's simple to understand. NOTE(review): needs len(data) >= 5 and dt != 0 -- confirm.
+        data = self.data
+        n = len(data)
+        t_last = data[-1][0]
+        dt = (t_last - data[-5][0]) / 4
+        k = math.log((t_last - data[0][0]) / dt / n + 1)
+        for i in range(1, n):
+            t_ideal = (math.exp(k * (n - i)/n) - 1) * (n * dt)
+            if t_last - data[i][0] > t_ideal:
+                log.debug("%s - %s > %s -> popping element %s", t_last, data[i][0], t_ideal, i)
+                data.pop(i)
+                break
+        else:
+            # Fallback: sometimes all the real points are below the curve, but
+            # we have to remove one anyway. The choice needs to be heavily biased
+            # towards newer data points, but we don't want to delete the few
+            # newest ones, so choose one at random from a narrow range just before
+            # them. NOTE(review): for small n (e.g. n = 10) both bounds truncate to the same int and randrange raises ValueError.
+            i = random.randrange(int(n*0.98), int(n*0.99))
+            log.debug("no match -> popping element %s", i)
+            data.pop(i)
+
    def add(self, ts, value):
        while len(self.data) >= self.limit:
-            r = random.randrange(0, self.limit)
-            self.data.pop(r)
+            self.shrink()

        if len(self.data) == 0 or ts >= self.data[-1][0]:
            self.data.append((ts, value,))