ltsdb/process_queue

#!/usr/bin/python3

import logging
import logging.config
import math
import os
import socket
import statistics
import time

from ltsdb_json import LTS

import config

# Configure logging from the project's config module before anything logs.
logging.config.dictConfig(config.logging)
log = logging.getLogger("process_queue")


# Primary host name of this machine as reported by a reverse lookup of the
# local host name; the alias and address lists are not needed.
node, _aliases, _addresses = socket.gethostbyaddr(socket.gethostname())

class DiskFullPredictor:
    """Predict when a filesystem will be full from its ``bytes_used`` history.

    The prediction is stored as a ``time_until_disk_full`` timeseries (unit
    ``s``) whose description is derived from the input timeseries.
    """

    # Reported when no prediction is possible. JSON can't portably represent
    # infinity, so a large finite number of seconds stands in for "never".
    NEVER = 1E9

    def match(self, lts):
        """Return True if lts is a ``bytes_used`` series with a mountpoint."""
        # measure=bytes_used, mountpoint=*
        description = lts.description
        return (
            "measure" in description
            and description["measure"] == "bytes_used"
            and "mountpoint" in description
        )

    def run(self, lts):
        """Estimate the time until the disk is full and save the estimate.

        The ``bytes_usable`` series with the otherwise identical description
        supplies the current capacity. Two estimates are tried in order:
        a look-back through the history (see ``_lookback``) and, if that
        finds nothing, an extrapolation from a baseline observation (see
        ``_extrapolate``). If neither yields a value, ``NEVER`` is stored.

        Nothing is stored if either series is empty or if their last
        timestamps differ by more than an hour.
        """
        # find matching bytes_usable series
        desc = {**lts.description, "measure": "bytes_usable"}
        usable_lts = LTS(description=desc)
        if len(lts.data) == 0 or len(usable_lts.data) == 0:
            # Without a latest observation of both series there is nothing
            # to compare, and indexing data[-1] below would raise.
            log.warning("Timeseries %s or %s is empty, skipping prediction",
                        lts.id, usable_lts.id)
            return
        # The two timeseries are always updated together, so the
        # timestamps should match exactly. But just in case we decide to
        # change that in the future accept a difference of up to an
        # hour.
        now = lts.data[-1][0]
        usable_now = usable_lts.data[-1][0]
        if abs(now - usable_now) > 3600:
            log.warning("Timeseries %s and %s have different end times: %s vs %s",
                        lts.id, usable_lts.id,
                        now, usable_now)
            return
        current_used_bytes = lts.data[-1][1]
        current_usable_bytes = usable_lts.data[-1][1]

        tuf = self.NEVER
        found = self._lookback(lts.data, current_used_bytes, current_usable_bytes)
        if found is not None:
            tuf, m = found
            log.info("d = %s, current_used_bytes = %s, current_usable_bytes = %s", m, current_used_bytes, current_usable_bytes)
        else:
            extrapolated = self._extrapolate(lts.data, current_used_bytes, current_usable_bytes)
            if extrapolated is not None:
                tuf = extrapolated

        desc = {**lts.description,
                "measure": "time_until_disk_full",
                "node": node,
                "unit": "s",
                "remote_addr": "",
                }
        prediction = LTS(desc)
        prediction.add(now, tuf)
        prediction.save()

    @staticmethod
    def _lookback(data, used, usable):
        """Find how far back usage was small enough to predict a full disk.

        Walks the history from newest to oldest. For each point the values
        of up to five neighbouring points (two on either side) are averaged
        to smooth out outliers. The first point whose smoothed value ``m``
        satisfies ``used ** 2 / m > usable`` (i.e. usage grew by a larger
        factor since then than it still has to grow to reach ``usable``)
        determines the result.

        Returns a tuple ``(age, m)`` where ``age`` is the time between that
        point and the last point, or None if no point qualifies.
        """
        now = data[-1][0]
        for i in reversed(range(len(data))):
            # Slicing clamps the upper bound to len(data) by itself.
            window = data[max(0, i - 2):i + 3]
            m = statistics.mean(point[1] for point in window)
            if m <= 0:
                # A ratio to zero is meaningless (and would divide by zero).
                continue
            if m < usable * 0.1:
                continue  # for sanity
            if used ** 2 / m > usable:
                return now - data[i][0], m
        return None

    @staticmethod
    def _extrapolate(data, used, usable):
        """Extrapolate the growth since a baseline point until ``usable``.

        The growth factor between the baseline and ``used`` is assumed to
        repeat over every interval of the same length. The baseline is the
        first data point if its value is positive and below ``used``;
        otherwise the smallest positive value below ``used`` anywhere in
        the series (first occurrence) is used.

        Returns the estimated number of seconds, which is never less than
        the time elapsed since the baseline, or None if there is no usable
        baseline or no measurable growth.
        """
        if used <= 0 or usable <= 0:
            # Growth factors are undefined for non-positive values.
            return None

        def lowest(point):
            # Try to always use the minimum of a range. Points with at
            # least four fields presumably carry it at index 2 (TODO
            # confirm against the LTS data format); shorter points only
            # have the plain value at index 1.
            return point[2] if len(point) >= 4 else point[1]

        now = data[-1][0]
        # We prefer the first datapoint
        first_used = lowest(data[0])
        first_i = 0
        if not 0 < first_used < used:
            # But if that's not usable we search the whole timeseries for
            # the minimum. Non-positive values are skipped because a growth
            # factor relative to zero cannot be computed.
            first_used = used
            first_i = None
            for i, point in enumerate(data):
                candidate = lowest(point)
                if 0 < candidate < first_used:
                    first_used = candidate
                    first_i = i
        if first_i is None:
            return None

        historic_growth = used / first_used
        future_growth = usable / used
        log_historic = math.log(historic_growth)
        if log_historic <= 0:
            # Can only happen if the quotient rounds to 1.0; no growth to
            # extrapolate from.
            return None
        elapsed = now - data[first_i][0]
        tuf = math.log(future_growth) / log_historic * elapsed
        # The look-back already ruled out everything within the observed
        # history, so never report less than the observed interval.
        return max(tuf, elapsed)

# Registry of processor instances; process() offers each timeseries to
# every entry in this list, in order.
processors = [DiskFullPredictor()]

def process(lts):
    """Hand a timeseries to every registered processor that accepts it.

    Each entry of ``processors`` is asked through its ``match`` method
    whether it applies to lts; for those that do, ``run`` is called with
    the same timeseries.
    """
    # The generator is lazy, so match() and run() still alternate per
    # processor instead of all matches being evaluated up front.
    applicable = (candidate for candidate in processors if candidate.match(lts))
    for handler in applicable:
        handler.run(lts)

# Main loop: poll the "queue" directory once per second, forever.
while True:
    for entry in os.listdir("queue"):
        # Each directory entry name is the id of a timeseries to load.
        queued = LTS(id=entry)
        # The entry is removed after loading but before processing, so an
        # entry whose processing fails is not picked up again.
        os.remove("queue/" + entry)
        process(queued)
    time.sleep(1)
Process queue 2022-12-11 22:58:26 +01:00			`#!/usr/bin/python3`

Add logging to process_queue 2022-12-27 11:44:03 +01:00			`import logging`
			`import logging.config`
Extrapolate further into the future So far we have only extrapolated as far into the future as we could look into the past. Everything beyond that was "infinity". Now we use the first and last observation to extrapolate beyond that. 2024-09-07 12:01:20 +02:00			`import math`
Process queue 2022-12-11 22:58:26 +01:00			`import os`
			`import socket`
Smooth out old data to avoid false positives in disk full prediction 2023-03-27 22:34:00 +02:00			`import statistics`
Process queue 2022-12-11 22:58:26 +01:00			`import time`

			`from ltsdb_json import LTS`

Add logging to process_queue 2022-12-27 11:44:03 +01:00			`import config`

			`logging.config.dictConfig(config.logging)`
			`log = logging.getLogger("process_queue")`


Process queue 2022-12-11 22:58:26 +01:00			`node = socket.gethostbyaddr(socket.gethostname())[0]`

			`class DiskFullPredictor:`
			`def match(self, lts):`
			`# measure=bytes_used, mountpoint=*`
			`if "measure" not in lts.description:`
			`return False`
			`if lts.description["measure"] != "bytes_used":`
			`return False`
			`if "mountpoint" not in lts.description:`
			`return False`
			`return True`

			`def run(self, lts):`
			`# find matching bytes_usable series`
			`desc = {**lts.description, "measure": "bytes_usable"}`
			`usable_lts = LTS(description=desc)`
			`# The two timeseries are always updated together, so the`
			`# timestamps should match exactly. But just in case we decide to`
			`# change that in the future accept a difference of up to an`
			`# hour.`
			`now = lts.data[-1][0]`
			`if abs(now - usable_lts.data[-1][0]) > 3600:`
			`log.warning("Timeseries %s and %s have different end times: %s vs %s",`
			`lts.id, usable_lts.id,`
			`now, usable_lts.data[-1][0])`
			`return`
			`current_used_bytes = lts.data[-1][1]`
			`current_usable_bytes = usable_lts.data[-1][1]`
Cap time until full at 1 billion seconds JSON can't represent infinity (at least not portably) so we need to use a finite value. I certainly won't be worried if a disk will fill up in 32 years. 2022-12-12 01:00:00 +01:00			`tuf = 1E9`
Smooth out old data to avoid false positives in disk full prediction 2023-03-27 22:34:00 +02:00			`for i in reversed(range(len(lts.data))):`
			`m = statistics.mean(x[1] for x in lts.data[max(0, i - 2) : min(len(lts.data), i + 3)])`
			`if m < current_usable_bytes * 0.1:`
Process queue 2022-12-11 22:58:26 +01:00			`continue # for sanity`
Smooth out old data to avoid false positives in disk full prediction 2023-03-27 22:34:00 +02:00			`if current_used_bytes ** 2 / m > current_usable_bytes:`
			`log.info("d = %s, current_used_bytes = %s, current_usable_bytes = %s", m, current_used_bytes, current_usable_bytes)`
			`tuf = now - lts.data[i][0]`
Process queue 2022-12-11 22:58:26 +01:00			`break`
Extrapolate further into the future So far we have only extrapolated as far into the future as we could look into the past. Everything beyond that was "infinity". Now we use the first and last observation to extrapolate beyond that. 2024-09-07 12:01:20 +02:00			`else:`
Search for global minimum if start of timeseries is unusable 2024-09-07 14:36:16 +02:00			`# Try always use the minimum of a range.`
			`# We prefer the first datapoint`
			`first_used_bytes = lts.data[0][2] if len(lts.data[0]) >= 4 else lts.data[0][1]`
			`# But if that's not useable we search the whole timeseries for the`
			`# minimum`
			`if first_used_bytes >= current_used_bytes:`
			`first_used_bytes = current_used_bytes`
			`first_i = None`
			`for i in range(len(lts.data)):`
			`used_bytes = lts.data[i][2] if len(lts.data[i]) >= 4 else lts.data[i][1]`
			`if used_bytes < first_used_bytes:`
			`first_used_bytes = used_bytes`
			`first_i = i`
			`else:`
			`first_i = 0`

			`if first_i is not None:`
			`historic_growth = current_used_bytes / first_used_bytes`
			`future_growth = current_usable_bytes / current_used_bytes`
			`tuf = math.log(future_growth) / math.log(historic_growth) * (now - lts.data[first_i][0])`
			`tuf = max(tuf, now - lts.data[first_i][0])`
Process queue 2022-12-11 22:58:26 +01:00			`desc = {**lts.description,`
			`"measure": "time_until_disk_full",`
			`"node": node,`
			`"unit": "s",`
			`"remote_addr": "",`
			`}`
			`lts = LTS(desc)`
			`lts.add(now, tuf)`
			`lts.save()`

			`processors = [`
			`DiskFullPredictor(),`
			`]`

			`def process(lts):`
			`for processor in processors:`
			`if processor.match(lts):`
			`processor.run(lts)`

			`while True:`
			`for id in os.listdir("queue"):`
			`lts = LTS(id=id)`
			`os.remove("queue/" + id)`
			`process(lts)`
			`time.sleep(1)`