ltsdb/process_queue

#!/usr/bin/python3

import logging
import logging.config
import math
import os
import socket
import statistics
import time

from ltsdb_json import LTS

import config

logging.config.dictConfig(config.logging)
log = logging.getLogger("process_queue")


node = socket.gethostbyaddr(socket.gethostname())[0]

class DiskFullPredictor:
    def match(self, lts):
        # measure=bytes_used, mountpoint=*
        if "measure" not in lts.description:
            return False
        if lts.description["measure"] != "bytes_used":
            return False
        if "mountpoint" not in lts.description:
            return False
        return True

    def run(self, lts):
        # find matching bytes_usable series
        desc = {**lts.description, "measure": "bytes_usable"}
        usable_lts = LTS(description=desc)
        # The two timeseries are always updated together, so the
        # timestamps should match exactly. But just in case we decide to
        # change that in the future accept a difference of up to an
        # hour.
        now = lts.data[-1][0]
        if abs(now - usable_lts.data[-1][0]) > 3600:
            log.warning("Timeseries %s and %s have different end times: %s vs %s",
                        lts.id, usable_lts.id,
                        now, usable_lts.data[-1][0])
            return
        current_used_bytes = lts.data[-1][1]
        current_usable_bytes = usable_lts.data[-1][1]
        tuf = 1E9
        for i in reversed(range(len(lts.data))):
            m = statistics.mean(x[1] for x in lts.data[max(0, i - 2) : min(len(lts.data), i + 3)])
            if m < current_usable_bytes * 0.1:
                continue # for sanity
            if current_used_bytes ** 2 / m > current_usable_bytes:
                log.info("d = %s, current_used_bytes = %s, current_usable_bytes = %s", m, current_used_bytes, current_usable_bytes)
                tuf = now - lts.data[i][0]
                break
        else:
            # Try always use the minimum of a range.
            # We prefer the first datapoint
            first_used_bytes = lts.data[0][2] if len(lts.data[0]) >= 4 else lts.data[0][1]
            # But if that's not useable we search the whole timeseries for the
            # minimum
            if first_used_bytes >= current_used_bytes:
                first_used_bytes = current_used_bytes
                first_i = None
                for i in range(len(lts.data)):
                    used_bytes = lts.data[i][2] if len(lts.data[i]) >= 4 else lts.data[i][1]
                    if used_bytes < first_used_bytes:
                        first_used_bytes = used_bytes
                        first_i = i
            else:
                first_i = 0

            if first_i is not None:
                historic_growth = current_used_bytes / first_used_bytes
                future_growth = current_usable_bytes  / current_used_bytes
                tuf = math.log(future_growth) / math.log(historic_growth) * (now - lts.data[first_i][0])
                tuf = max(tuf, now - lts.data[first_i][0])
                tuf = min(tuf, 1E9)
        desc = {**lts.description,
             "measure": "time_until_disk_full",
             "node": node,
             "unit": "s",
             "remote_addr": "",
            }
        lts = LTS(desc)
        lts.add(now, tuf)
        lts.save()

processors = [
    DiskFullPredictor(),
]

def process(lts):
    for processor in processors:
        if processor.match(lts):
            processor.run(lts)

while True:
    for id in os.listdir("queue"):
        lts = LTS(id=id)
        os.remove("queue/" + id)
        process(lts)
    time.sleep(1)