Make expiry exponential

Choosing the element to expire from a uniform random distribution tends to expire data points much too early. We want to always keep the oldest observation and have a roughly exponential distribution between the newest and the oldest observation.
2022-12-17 17:49:19 +01:00 · 2022-12-17 17:49:19 +01:00 · b10b62e77d
parent a802f2ee27
commit b10b62e77d
1 changed files with 34 additions and 2 deletions
--- a/ltsdb_json.py
+++ b/ltsdb_json.py
@ -4,9 +4,13 @@ import fcntl
 import glob
 import hashlib
 import json
 import logging
 import math
 import random
 import time
 log = logging.getLogger()
 class LTS:
    base_dir = "data"
    queue_dir = "queue"
@ -39,10 +43,38 @@ class LTS:
                json.dump({"description": self.description, "data": self.data}, fh)
            self.rebuild_index()
    def shrink(self):
        """Remove exactly one element from ``self.data``.

        The element is chosen in such a way that the distribution of the
        remaining timestamps gets closer to an exponential curve through the
        first and the last few data points.

        To do this we compute the ideal age (distance to the newest
        timestamp) at each position and compare it to the real age. We
        remove the first point which sticks out too much (I'm tempted to dub
        this the barber's algorithm). This is extremely inefficient but it's
        simple to understand and works.

        Each entry of ``self.data`` is indexed as ``entry[0]`` to obtain its
        timestamp. The curve search starts at index 1, so it never removes
        the oldest entry.

        If no point sticks out, or no curve can be fitted (fewer than two
        points, identical newest timestamps, or a non-positive logarithm
        argument), an element near the newest end is removed instead.

        Raises:
            IndexError: if ``self.data`` is empty.
        """
        data = self.data
        n = len(data)
        if n == 0:
            raise IndexError("cannot shrink an empty series")
        t_last = data[-1][0]

        # Average spacing of the newest points. Normally this is taken over
        # the last four intervals; with fewer than five points we use as many
        # intervals as exist instead of indexing past the start of the list.
        intervals = min(4, n - 1)
        k = None
        if intervals > 0:
            dt = (t_last - data[-1 - intervals][0]) / intervals
            try:
                # k is chosen so that the ideal age at index 0 equals the
                # real age of the oldest point.
                k = math.log((t_last - data[0][0]) / dt / n + 1)
            except (ZeroDivisionError, ValueError):
                # dt == 0 (the newest points share one timestamp) or the
                # argument of the logarithm is not positive: no curve can be
                # fitted, so fall through to the random choice below.
                k = None

        if k is not None:
            for i in range(1, n):
                t_ideal = (math.exp(k * (n - i) / n) - 1) * (n * dt)
                if t_last - data[i][0] > t_ideal:
                    log.debug("%s - %s > %s -> popping element %s", t_last, data[i][0], t_ideal, i)
                    data.pop(i)
                    return

        # Well, it works mostly. Sometimes all the real points are below the
        # curve but we have to remove one anyway. This needs to be heavily
        # biased towards newer data points, but we don't want to delete the
        # few newest data points so choose one at random from a narrow range
        # just before that.
        # For n < 100 the two bounds can truncate to the same integer, which
        # would make randrange() raise ValueError on an empty range; widen
        # the range to at least one candidate in that case.
        lo = int(n * 0.98)
        hi = max(int(n * 0.99), lo + 1)
        i = random.randrange(lo, hi)
        log.debug("no match -> popping element %s", i)
        data.pop(i)
    def add(self, ts, value):
        while len(self.data) >= self.limit:
-            r = random.randrange(0, self.limit)
+            self.shrink()
            self.data.pop(r)
        if len(self.data) == 0 or ts >= self.data[-1][0]:
            self.data.append((ts, value,))