ltsdb/ltsdb_json.py

#!/usr/bin/python3

import fcntl
import glob
import hashlib
import json
import logging
import math
import random
import time

log = logging.getLogger()

class LTS:
    base_dir = "data"
    queue_dir = "queue"
    limit = 1000

    def __init__(self, description=None, id=None):
        if description:
            canonical_description = {x: description[x] for x in sorted(description.keys())}
            self.description = canonical_description
            serialized_description = json.dumps(canonical_description)
            m = hashlib.sha256()
            m.update(bytes(serialized_description, encoding="UTF-8"))
            id = m.hexdigest()
        self.filename = self.base_dir + "/" + id
        self.id = id
        try:
            with open(self.filename, "r") as fh:
                fcntl.flock(fh, fcntl.LOCK_SH)
                d = json.load(fh)
            self.new = False
            self.description = d["description"]
            self.data = d["data"]
        except FileNotFoundError as e:
            self.new = True
            self.data = []
            # Create the file immediately. Makes saving later simpler if we can
            # assume it exists
            with open(self.filename, "x+") as fh:
                fcntl.flock(fh, fcntl.LOCK_EX)
                json.dump({"description": self.description, "data": self.data}, fh)
            self.rebuild_index()

    def shrink(self):
        # Remove one element in such a way that the distributions gets closer
        # to an exponential curve through the first and the last few data
        # points.
        # To do this we compute the ideal t value at each point and compare it
        # to the real value. We remove the first point which sticks out too
        # much (I'm tempted to dub this the barber's algorithm).
        # This extremely inefficient but it's simple to understand and works.
        data = self.data
        n = len(data)
        t_last = data[-1][0]
        dt = (t_last - data[-5][0]) / 4
        k = math.log((t_last - data[0][0]) / dt / n + 1)
        for i in range(1, n):
            t_ideal = (math.exp(k * (n - i)/n) - 1) * (n * dt)
            if t_last - data[i][0] > t_ideal:
                log.debug("%s - %s > %s -> popping element %s", t_last, data[i][0], t_ideal, i)
                data.pop(i)
                break
        else:
            # Well, it works mostly. Sometimes all the real points are below
            # the curve but we have to remove one anyway. This needs to be
            # heavily biased towards newer data points, but we don't want to
            # delete the few newest data points so choose one at random from a
            # narrow range just before that.
            i = random.randrange(int(n*0.98), int(n*0.99))
            log.debug("no match -> popping element %s", i)
            data.pop(i)

    def add(self, ts, value):
        while len(self.data) >= self.limit:
            self.shrink()

        if len(self.data) == 0 or ts >= self.data[-1][0]:
            self.data.append((ts, value,))
        else:
            # Shouldn't happen that often, so I do a simple linear search instead
            # of a binary search
            for i in range(len(self.data)):
                if self.data[i][0] >= ts:
                    break
            self.data.insert(i, (ts, value,))

    def save(self):
        with open(self.filename, "r+") as fh:
            fcntl.flock(fh, fcntl.LOCK_EX)
            json.dump({"description": self.description, "data": self.data}, fh)
            fh.truncate()
        with open(self.queue_dir + "/" + self.id, "w") as fh:
            pass

    def rebuild_index(self):
        t0 = time.time()
        index  = {}
        for fn in glob.glob(self.base_dir + "/*"):
            (_, _, hash) = fn.rpartition("/")
            with open(fn, "r") as fh:
                fcntl.flock(fh, fcntl.LOCK_SH)
                d = json.load(fh)
            for k, v in d["description"].items():
                d1 = index.setdefault(k, {})
                d2 = d1.setdefault(v, [])
                d2.append(hash)
        with open(self.base_dir + "/.index", "r+") as fh:
            fcntl.flock(fh, fcntl.LOCK_EX)
            json.dump(index, fh)
        t1 = time.time()
        print("index rebuilt in", t1 - t0, "seconds")

    @classmethod
    def find(cls, match):
        result = None
        with open(cls.base_dir + "/.index", "r") as fh:
            fcntl.flock(fh, fcntl.LOCK_SH)
            index = json.load(fh)
            for d, v in match.items():
                    ts = set(index[d][v])
                    if result is None:
                        result = ts
                    else:
                        result &= ts
        return result

    def data_json_by_row(self):
        d = []
        for dp in self.data:
            d.append({
                "t": dp[0],
                "v": dp[1],
                "utc": time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(dp[0])),
            })
        return json.dumps(d)