From dfc1e8bf8bf54e33b62d090b0e5c1fcf2e4f0f5b Mon Sep 17 00:00:00 2001
From: "Peter J. Holzer" <hjp@hjp.at>
Date: Fri, 25 Feb 2022 16:21:07 +0100
Subject: [PATCH] Implement MVP of DSL

I implemented the parser manually since I couldn't get pypeg2 to emit
useful error messages (I confess to being a bit proud of my multi-level
color-coded reporting).

The DSL isn't complete yet. Explicit dependencies and nested rules are
still missing, as are most column types. But it's enough to create and
populate some tables from a real-world project, so I consider it to have
reached the MVP stage.
---
 demo_usradm_templates.procrusql |  55 ++++++
 parse_dsl_manual                | 307 ++++++++++++++++++++++++++++++++
 2 files changed, 362 insertions(+)
 create mode 100644 demo_usradm_templates.procrusql
 create mode 100755 parse_dsl_manual

diff --git a/demo_usradm_templates.procrusql b/demo_usradm_templates.procrusql
new file mode 100644
index 0000000..10c2582
--- /dev/null
+++ b/demo_usradm_templates.procrusql
@@ -0,0 +1,55 @@
+table service_type
+column service_type id serial primary key
+column service_type type text not null unique
+column service_type description text
+
+data service_type {type: "group"} {}
+data service_type {type: "publ"} {}
+data service_type {type: "rolle"} {}
+
+table service
+column service id serial primary key
+column service type text
+column service feature text
+column service description text
+column service hidden boolean
+
+data service {type: "login", feature: "unix"} {description: "Can login on Unix systems", hidden: False} >> data_service_login_unix
+data service {type: "login", feature: "wds"} {description: "Can login into WDS", hidden: False} >> data_service_login_wds
+data service {type: "login", feature: "wwa"} {description: "Can login into WWA", hidden: False} >> data_service_login_wwa
+data service {type: "group", feature: "wifo"} {hidden: False} >> data_service_group_wifo
+data service {type: "group", feature: "wifo-intern"} {hidden: False} >> data_service_group_wifo_intern
+data service {type: "group", feature: "wsr"} {hidden: False} >> data_service_group_wsr
+data service {type: "mailinglist2", feature: "wifo-aktuell"} {hidden: False} >> data_service_mailinglist2_wifo_aktuell
+data service {type: "org", feature: "WIFO"} {hidden: False} >> data_service_org_WIFO
+data service {type: "org", feature: "WSR"} {hidden: False} >> data_service_org_WSR
+data service {type: "publ", feature: "wifo_intern"} {hidden: False} >> data_service_publ_wifo_intern
+data service {type: "publ", feature: "wifo_temporary"} {hidden: False} >> data_service_publ_wifo_temporary
+data service {type: "rolle", feature: "administrative_staff_member"} {hidden: False} >> data_service_rolle_administrative_staff_member
+data service {type: "rolle", feature: "associate"} {hidden: False} >> data_service_rolle_associate
+data service {type: "rolle", feature: "economist"} {hidden: False} >> data_service_rolle_economist
+data service {type: "rolle", feature: "emeritus_consultant"} {hidden: False} >> data_service_rolle_emeritus_consultant
+data service {type: "rolle", feature: "research_assistant"} {hidden: False} >> data_service_rolle_research_assistant
+data service {type: "rolle", feature: "scientific_administration_staff_member"} {hidden: False} >> data_service_rolle_scientific_administration_staff_member
+data service {type: "rolle", feature: "scientific_administrative_assistant"} {hidden: False} >> data_service_rolle_scientific_administrative_assistant
+data service {type: "rolle", feature: "scientific_consultant"} {hidden: False} >> data_service_rolle_scientific_consultant
+data service {type: "rolle", feature: "senior_economist"} {hidden: False} >> data_service_rolle_senior_economist
+
+table template
+column template id            serial primary key
+column template name          text   not null unique
+column template sortorder     int
+column template email_pattern text
+column template email_after   text
+column template funktion      text
+column template status        text
+
+data template {name: "Standard Vorlage"} {sortorder: 1} >> data_template_std
+data template {name: "WIFO Wissenschaftliche Assistenz"} {sortorder: 10, "email_pattern": "vorname#.#nachname", "email_after": "wifo.ac.at", funktion: "FB (1-5)", status: "FU, FM"} >> data_template_wifo_wa
+
+table template_service
+column template_service id serial primary key
+column template_service template int references template
+column template_service service int references service
+
+data template_service {template: @data_template_std/0/id, service: @data_service_login_wwa/0/id} {}
diff --git a/parse_dsl_manual b/parse_dsl_manual
new file mode 100755
index 0000000..ecdc431
--- /dev/null
+++ b/parse_dsl_manual
@@ -0,0 +1,307 @@
+#!/usr/bin/python3
+import logging
+import re
+import sys
+from pprint import pprint
+
+import psycopg2
+
+import procrusql
+
+class RuleFile:  # Reads a rule file once and keeps its contents in .text.
+    def __init__(self, filename):  # filename is passed straight to open()
+        with open(filename) as f:
+            self.text = f.read()  # the entire file as one string
+
+class Failure:
+    """One failed parse attempt: message, text offset and the failing state."""
+    def __init__(self, message, position, parse_state):
+        self.message, self.position = message, position
+        self.parse_state = parse_state
+
+class ParseState:
+    """Cursor over the rule text, plus the furthest failure recorded on it."""
+
+    def __init__(self, text, position=0):
+        self.text, self.position = text, position
+        self.child_failure = None
+
+    def clone(self):
+        """Return a new state at the same position, without failure or ast."""
+        return ParseState(self.text, self.position)
+
+    @property
+    def rest(self):
+        """The text from the current position to the end."""
+        return self.text[self.position:]
+
+    def printerror(self):
+        """Print the recorded failure with context, then that of the failed state."""
+        failure = self.child_failure
+        if not failure:
+            return
+        before = self.text[:failure.position].split("\n")
+        after = self.text[failure.position:].split("\n")
+        good, bad, reset = "\x1B[40;32m", "\x1B[40;31m", "\x1B[0m"
+        shown = [reset + f"{ln+1:4}: " + good + before[ln] + reset
+                 for ln in range(max(len(before) - 3, 0), len(before))]
+        print(reset + failure.message + "\n" + "\n".join(shown) + bad + after[0] + reset)
+
+        if failure.parse_state:
+            failure.parse_state.printerror()
+
+    def skip_whitespace_and_comments(self):
+        """Advance past any run of whitespace and “#” comments."""
+        self.match(r"(\s+|#.*)*")
+
+    def match(self, regexp):
+        """Match regexp at the position; advance on success. Returns the match or None."""
+        found = re.match(regexp, self.text[self.position:])
+        if found:
+            self.position += len(found.group(0))
+        return found
+
+    def match_newlines(self):
+        """Advance past whitespace and comments that end in a newline, if any."""
+        self.match(r"(\s+|#.*)*\n")
+
+    def record_child_failure(self, ps_child, msg):
+        """Store the failure of ps_child if it got further than the stored one."""
+        if self.child_failure and ps_child.position <= self.child_failure.position:
+            return
+        self.child_failure = Failure(position=ps_child.position, message=msg, parse_state=ps_child)
+
+def parse_ruleset(ps):  # Parse all rules; returns the new state (ast = list of rules) or None.
+    ps2 = ps.clone()
+    ps2.ast = []
+    while ps2.rest:
+        ps3 = parse_table_rule(ps2) or parse_column_rule(ps2) or parse_data_rule(ps2)
+        if not ps3:
+            ps.record_child_failure(ps2, "expected one of: table rule, column rule, data rule")
+            return None
+        ps2.ast.append(ps3.ast)
+        ps2.position = ps3.position
+        ps2.skip_whitespace_and_comments()  # leftover blanks/comments are not a broken rule
+    return ps2
+
+def parse_table_rule(ps):
+    """Parse “table <name>” into a HaveTable rule; return the state or None."""
+    state = ps.clone()
+    state.skip_whitespace_and_comments()
+    if state.match(r"table\b") is None:
+        ps.record_child_failure(state, "expected “table”")
+        return None
+    state.skip_whitespace_and_comments()
+    named = parse_table_name(state)
+    if not named:
+        ps.record_child_failure(state, "expected table name")
+        return None
+    state.ast = procrusql.HaveTable(rulename(), [], named.ast[0])
+    state.position = named.position
+    return state
+
+def parse_column_rule(ps):
+    """Parse “column <table> <name> <definition>” into a HaveColumn rule.
+
+    Returns the advanced state, or None after recording the failure on ps.
+    """
+    state = ps.clone()
+    state.ast = []
+    state.skip_whitespace_and_comments()
+    if not state.match(r"column\b"):
+        ps.record_child_failure(state, "expected “column”")
+        return None
+
+    # The table name should be omitted if this is part of a table declaration.
+    # I haven't decided if I want to make that optional in this rule or write a
+    # different rule. Probably the latter. If the former, I may have to change
+    # the syntax to avoid ambiguity.
+    steps = (
+        (parse_table_name, "expected table name"),
+        (parse_column_name, "expected column name"),
+        (parse_column_definition, "expected column definition"),
+    )
+
+    # Each step continues where the previous one stopped and contributes
+    # one constructor argument, in the order HaveColumn is called with.
+    arguments = []
+    for parse_step, complaint in steps:
+        piece = parse_step(state)
+        if not piece:
+            ps.record_child_failure(state, complaint)
+            return None
+        arguments.append(piece.ast[0])
+        state.position = piece.position
+
+    state.ast = procrusql.HaveColumn(rulename(), [], *arguments)
+
+    state.match_newlines()
+
+    return state
+
+def parse_data_rule(ps):
+    """Parse “data <table> {key data} {extra data} [>> label]” into a HaveData rule.
+
+    Returns the advanced state, or None after recording the failure on ps.
+    """
+    state = ps.clone()
+    state.skip_whitespace_and_comments()
+    if not state.match(r"data\b"):
+        ps.record_child_failure(state, "expected “data”")
+        return None
+
+    named = parse_table_name(state)
+    if not named:
+        ps.record_child_failure(state, "expected table name")
+        return None
+    table_name = named.ast[0]
+    state.position = named.position
+
+    # Two dicts follow: the key data first, then the extra data.
+    dicts = []
+    for complaint in ("expected key data definition", "expected extra data definition"):
+        parsed = parse_dict(state)
+        if not parsed:
+            ps.record_child_failure(state, complaint)
+            return None
+        dicts.append(parsed.ast)
+        state.position = parsed.position
+    key_data, extra_data = dicts
+
+    # The label is optional; without one the rule gets a generated name.
+    labelled = parse_label(state)
+    if labelled:
+        label = labelled.ast
+        state.position = labelled.position
+    else:
+        label = rulename()
+
+    state.ast = procrusql.HaveData(label, [], table_name, key_data, extra_data)
+    state.match_newlines()
+    return state
+
+def parse_table_name(ps):
+    # For now this matches only simple names, not schema-qualified names or
+    # quoted names. Returns the new state, or None if no name starts here.
+    ps2 = ps.clone()
+    ps2.ast = []
+    ps2.skip_whitespace_and_comments()
+    if ps2.rest[:1].isalpha():  # [:1] instead of [0]: no IndexError at end of text
+        m = ps2.match(r"\w+") # always succeeds since we already checked the first character
+        ps2.ast.append(m.group(0))
+        return ps2
+    ps.record_child_failure(ps2, "expected table name")
+    return None  # callers test the result for falsiness and then read ast[0]
+
+def parse_column_name(ps):
+    """Parse a simple column name; return the new state, or None on failure."""
+    # For now this matches only simple names, not quoted names.
+    # Also, this is nearly a duplicate of parse_table_name, but they will
+    # probably diverge, so I duplicated it.
+    ps2 = ps.clone()
+    ps2.ast = []
+    ps2.skip_whitespace_and_comments()
+    if ps2.rest[:1].isalpha():  # [:1] instead of [0]: no IndexError at end of text
+        m = ps2.match(r"\w+") # always succeeds since we already checked the first character
+        ps2.ast.append(m.group(0))
+        return ps2
+    ps.record_child_failure(ps2, "expected column name")
+    return None
+
+def parse_column_definition(ps):
+    """Parse a column type with optional constraints; return the state or None."""
+    state = ps.clone()
+    state.ast = []
+    state.skip_whitespace_and_comments()
+    if not (found := state.match(r"(int|serial|text|boolean)(\s+not null)?(\s+(primary key|unique|references \w+))?\b")):
+        ps.record_child_failure(state, "expected column definition")
+        return None
+    state.ast.append(found.group(0))
+    return state
+
+def parse_dict(ps):
+    """Parse a “{key: value, ...}” literal; the dict ends up in .ast.
+
+    Keys are bare words or quoted strings; values are integers, quoted
+    strings, booleans, None/null/NULL or “@name/index/name” references.
+    Returns the advanced state, or None after recording the failure on ps.
+    """
+    # Tried in this order; the first pattern that matches decides the value.
+    value_forms = (
+        (r'[0-9]+', lambda m: int(m.group(0))),
+        # XXX - process backslash escapes
+        (r'"([^"]*)"', lambda m: m.group(1)),
+        (r'[tT]rue', lambda m: True),
+        (r'[fF]alse', lambda m: False),
+        (r'None|null|NULL', lambda m: None),
+        (r'@(\w+)/(\d+)/(\w+)', lambda m: procrusql.Ref(m.group(1), int(m.group(2)), m.group(3))),
+    )
+    state = ps.clone()
+    entries = {}
+    state.skip_whitespace_and_comments()
+    if not state.match(r"{"):
+        ps.record_child_failure(state, "expected “{”")
+        return None
+    while True:
+        state.skip_whitespace_and_comments()
+        if state.match(r'}'):
+            break
+
+        key_match = state.match(r'\w+|"([^"]+)"')
+        if not key_match:
+            ps.record_child_failure(state, "expected column name")
+            return None
+        # XXX - unquote properly
+        key = key_match.group(1) or key_match.group(0)
+        state.skip_whitespace_and_comments()
+        if not state.match(":"):
+            ps.record_child_failure(state, "expected “:”")
+            return None
+        state.skip_whitespace_and_comments()
+        for pattern, convert in value_forms:
+            if value_match := state.match(pattern):
+                entries[key] = convert(value_match)
+                break
+        else:
+            ps.record_child_failure(state, "expected value")
+            return None
+        state.skip_whitespace_and_comments()
+        comma_found = state.match(r',')
+        state.skip_whitespace_and_comments()
+        if state.match(r'}'):
+            break
+        if not comma_found:
+            ps.record_child_failure(state, "expected comma or close brace")
+            return None
+    state.ast = entries
+    return state
+
+def parse_label(ps):
+    """Parse a “>> name” label; return the advanced state, or None."""
+    attempt = ps.clone()
+    if not (found := attempt.match(r"\s*>>\s*(\w+)")):
+        ps.record_child_failure(attempt, "expected label definition")
+        return None
+    attempt.ast = found.group(1)
+    return attempt
+
+rulenum = 0  # how many rule names rulename() has generated so far
+def rulename():  # Returns the next generated name: "__rule_1", "__rule_2", ...
+    global rulenum
+    rulenum = rulenum + 1
+    return "__rule_" + str(rulenum)
+
+if __name__ == "__main__":
+    # argv[1]: rule file to parse; argv[2]: argument for psycopg2.connect().
+    logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s %(lineno)d | %(message)s", level=logging.DEBUG)
+    with open(sys.argv[1]) as rule_file:
+        initial = ParseState(rule_file.read())
+
+    parsed = parse_ruleset(initial)
+    if not parsed:
+        # Show the chain of failures recorded while parsing, then give up.
+        initial.printerror()
+        sys.exit(1)
+
+    connection = psycopg2.connect(sys.argv[2])
+    procrusql.fit(connection, parsed.ast)