Implement json prototype of LTsDb

Peter J. Holzer 2022-08-21 11:58:31 +02:00
parent 1a4d8ef471
commit 6d689666e9
3 changed files with 118 additions and 0 deletions

doc/multiple-timeseries (Normal file, 17 lines)

@@ -0,0 +1,17 @@
PoC, not optimized for performance.
Store data as JSON.
Use one file per timeseries.
Metadata is a dict of dimension/value pairs.
We can find each file quickly by using a hash of the metadata as the
filename.
Can we find all timeseries which match only some of the dimensions
(e.g. response times of a particular service across all nodes)?
Opening every file on each lookup is going to get slow quickly, so we need an index.
We don't expect new timeseries to spring into existence all that often,
so I guess for now we can just rewrite the whole index when a new
timeseries is added. Structure is pretty simple: Two levels of
dict (dimension, value) and then a list of matching timeseries.
Using an RDBMS doesn't seem like a good idea;
I'll design an efficient binary format later.
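
As an illustration of the two notes above, a sketch of how metadata maps to a
filename and how the index is laid out (the metadata values are the ones used
in the test script below):

    import hashlib, json

    metadata = {"measure": "uptime", "hostname": "rorschach.hjp.at"}
    canonical = json.dumps({k: metadata[k] for k in sorted(metadata)})
    filename = hashlib.sha256(canonical.encode("UTF-8")).hexdigest()

    # The index is two levels of dict and then a list of matching hashes:
    # {
    #     "hostname": {"rorschach.hjp.at": ["<hash1>", "<hash2>"]},
    #     "measure": {"uptime": ["<hash1>"], "rtt": ["<hash2>"]}
    # }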

ltsdb_json.py (Normal file, 81 lines)

@@ -0,0 +1,81 @@
#!/usr/bin/python3
import fcntl
import glob
import hashlib
import json
import random
import time
class LTS:
    base_dir = "data"
    limit = 1000

    def __init__(self, description):
        # Oh, I think we need to be able to load by hash, too
        canonical_description = {x: description[x] for x in sorted(description.keys())}
        self.description = canonical_description
        serialized_description = json.dumps(canonical_description)
        m = hashlib.sha256()
        m.update(bytes(serialized_description, encoding="UTF-8"))
        self.filename = self.base_dir + "/" + m.hexdigest()
        try:
            with open(self.filename, "r") as fh:
                fcntl.flock(fh, fcntl.LOCK_SH)
                d = json.load(fh)
                self.new = False
                self.data = d["data"]
        except FileNotFoundError:
            self.new = True
            self.data = []
            # Create the file immediately. Makes saving later simpler if we
            # can assume it exists.
            with open(self.filename, "x+") as fh:
                fcntl.flock(fh, fcntl.LOCK_EX)
                json.dump({"description": self.description, "data": self.data}, fh)
            self.rebuild_index()
    def add(self, ts, value):
        # Keep at most `limit` points by evicting random entries first.
        while len(self.data) >= self.limit:
            r = random.randrange(0, self.limit)
            self.data.pop(r)
        if len(self.data) == 0 or ts >= self.data[-1][0]:
            self.data.append((ts, value))
        else:
            # Shouldn't happen that often, so I do a simple linear search
            # instead of a binary search
            for i in range(len(self.data)):
                if self.data[i][0] >= ts:
                    break
            self.data.insert(i, (ts, value))
    def save(self):
        with open(self.filename, "r+") as fh:
            fcntl.flock(fh, fcntl.LOCK_EX)
            json.dump({"description": self.description, "data": self.data}, fh)
            # The new JSON may be shorter than the old; drop any leftover tail.
            fh.truncate()
    def rebuild_index(self):
        t0 = time.time()
        index = {}
        for fn in glob.glob(self.base_dir + "/*"):
            (_, _, hash) = fn.rpartition("/")
            with open(fn, "r") as fh:
                fcntl.flock(fh, fcntl.LOCK_SH)
                d = json.load(fh)
                for k, v in d["description"].items():
                    d1 = index.setdefault(k, {})
                    d2 = d1.setdefault(v, [])
                    d2.append(hash)
        # "a+" creates the index file on first use ("r+" would fail);
        # take the lock before rewriting it in place.
        with open(self.base_dir + "/.index", "a+") as fh:
            fcntl.flock(fh, fcntl.LOCK_EX)
            fh.seek(0)
            fh.truncate()
            json.dump(index, fh)
        t1 = time.time()
        print("index rebuilt in", t1 - t0, "seconds")
    @classmethod
    def find(cls, match):
        with open(cls.base_dir + "/.index", "r") as fh:
            fcntl.flock(fh, fcntl.LOCK_SH)
            index = json.load(fh)
        # Intersect the hash lists of all requested dimension/value pairs.
        sets = [set(index.get(k, {}).get(v, [])) for k, v in match.items()]
        return set.intersection(*sets) if sets else set()
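
For reference, a data file written by save() is a single JSON object of the
shape below (timestamps and values taken from the test script that follows;
JSON renders the (ts, value) tuples as arrays):

    {"description": {"hostname": "rorschach.hjp.at", "measure": "uptime"},
     "data": [[1661026122, 4], [1661026361, 5]]}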

ltsdb_test (Executable file, 20 lines)

@@ -0,0 +1,20 @@
#!/usr/bin/python3
from ltsdb_json import LTS
ts1 = LTS({"hostname": "rorschach.hjp.at", "measure": "uptime"})
ts1.add(1661026122, 4)
ts1.save()
ts1 = LTS({"hostname": "rorschach.hjp.at", "website": "i12e.hjp.at", "measure": "rtt"})
ts1.add(1661026122, 0.06)
ts1.save()
ts1 = LTS({"hostname": "rorschach.hjp.at", "measure": "uptime"})
ts1.add(1661026361, 5)
ts1.save()
ts1 = LTS({"hostname": "charly.wsr.ac.at", "website": "www.wifo.ac.at", "measure": "rtt"})
ts1.add(1661026122, 0.347)
ts1.save()
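
Assuming the find() sketch above, the partial-dimension lookup described in
the notes (e.g. all rtt timeseries across hosts) could be exercised by
appending something like this to the test script (hypothetical, not part of
the commit):

    matches = LTS.find({"measure": "rtt"})
    print(matches)  # hashes of the two rtt timeseries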