#!/usr/bin/python3

import fcntl
import glob
import hashlib
import json
import logging
import math
import random
import time

log = logging.getLogger()


class LTS:
    base_dir = "data"
    queue_dir = "queue"
    limit = 1000

    def __init__(self, description=None, id=None):
        if description:
            # The id is the SHA-256 hash of the canonical (key-sorted) JSON
            # serialization of the description.
            canonical_description = {x: description[x] for x in sorted(description.keys())}
            self.description = canonical_description
            serialized_description = json.dumps(canonical_description)
            m = hashlib.sha256()
            m.update(bytes(serialized_description, encoding="UTF-8"))
            id = m.hexdigest()
        self.filename = self.base_dir + "/" + id
        self.id = id
        try:
            with open(self.filename, "r") as fh:
                fcntl.flock(fh, fcntl.LOCK_SH)
                d = json.load(fh)
                self.new = False
                self.description = d["description"]
                self.data = d["data"]
        except FileNotFoundError as e:
            self.new = True
            self.data = []
            # Create the file immediately. Makes saving later simpler if we can
            # assume it exists.
            with open(self.filename, "x+") as fh:
                fcntl.flock(fh, fcntl.LOCK_EX)
                json.dump({"description": self.description, "data": self.data}, fh)
            self.rebuild_index()

    def shrink(self):
        # Remove one element in such a way that the distribution gets closer
        # to an exponential curve through the first and the last few data
        # points.
        # To do this we compute the ideal t value at each point and compare it
        # to the real value. We remove the first point which sticks out too
        # much (I'm tempted to dub this the barber's algorithm).
        # This is extremely inefficient but it's simple to understand and works.
        data = self.data
        n = len(data)
        t_last = data[-1][0]
        dt = (t_last - data[-5][0]) / 4
        k = math.log((t_last - data[0][0]) / dt / n + 1)
        for i in range(1, n):
            t_ideal = (math.exp(k * (n - i) / n) - 1) * (n * dt)
            if t_last - data[i][0] > t_ideal:
                log.debug("%s - %s > %s -> popping element %s",
                          t_last, data[i][0], t_ideal, i)
                data.pop(i)
                break
        else:
            # Well, it works mostly. Sometimes all the real points are below
            # the curve but we have to remove one anyway. This needs to be
            # heavily biased towards newer data points, but we don't want to
            # delete the few newest data points, so choose one at random from a
            # narrow range just before that.
            i = random.randrange(int(n * 0.98), int(n * 0.99))
            log.debug("no match -> popping element %s", i)
            data.pop(i)

    def add(self, ts, value):
        while len(self.data) >= self.limit:
            self.shrink()
        if len(self.data) == 0 or ts >= self.data[-1][0]:
            self.data.append((ts, value,))
        else:
            # Shouldn't happen that often, so I do a simple linear search
            # instead of a binary search.
            for i in range(len(self.data)):
                if self.data[i][0] >= ts:
                    break
            self.data.insert(i, (ts, value,))

    def save(self):
        with open(self.filename, "r+") as fh:
            fcntl.flock(fh, fcntl.LOCK_EX)
            json.dump({"description": self.description, "data": self.data}, fh)
            fh.truncate()
        # Touch an empty marker file named after the series id in the queue
        # directory.
        with open(self.queue_dir + "/" + self.id, "w") as fh:
            pass

    def rebuild_index(self):
        t0 = time.time()
        index = {}
        for fn in glob.glob(self.base_dir + "/*"):
            (_, _, hash) = fn.rpartition("/")
            with open(fn, "r") as fh:
                fcntl.flock(fh, fcntl.LOCK_SH)
                d = json.load(fh)
                for k, v in d["description"].items():
                    d1 = index.setdefault(k, {})
                    d2 = d1.setdefault(v, [])
                    d2.append(hash)
        with open(self.base_dir + "/.index", "r+") as fh:
            fcntl.flock(fh, fcntl.LOCK_EX)
            json.dump(index, fh)
            # Truncate in case the new index is shorter than the old one,
            # matching the pattern used in save().
            fh.truncate()
        t1 = time.time()
        print("index rebuilt in", t1 - t0, "seconds")

    @classmethod
    def find(cls, match):
        # Intersect the id sets for all requested description key/value pairs.
        result = None
        with open(cls.base_dir + "/.index", "r") as fh:
            fcntl.flock(fh, fcntl.LOCK_SH)
            index = json.load(fh)
            for d, v in match.items():
                ts = set(index[d][v])
                if result is None:
                    result = ts
                else:
                    result &= ts
        return result

    def data_json_by_row(self):
        d = []
        for dp in self.data:
            d.append({
                "t": dp[0],
                "v": dp[1],
                "utc": time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(dp[0])),
            })
        return json.dumps(d)
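
# The block below is not part of the original module; it is a minimal usage
# sketch. It assumes the "data" and "queue" directories already exist and that
# "data/.index" has been initialized with valid JSON (e.g. "{}"), since
# rebuild_index() opens it with "r+". The description keys ("metric", "host")
# and the sample values are purely hypothetical.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    # Create (or reopen) a series identified by its canonical description.
    series = LTS(description={"metric": "temperature", "host": "example"})

    # Record a value with the current timestamp and persist it.
    series.add(time.time(), 21.5)
    series.save()

    # Print the ids of all series whose description contains metric=temperature.
    print(LTS.find({"metric": "temperature"}))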