Merge thread-handling from mbox2threads into mbox2web

2020-04-12 23:08:10 +02:00 · 2020-04-12 23:08:10 +02:00 · 6923e6273a
parent 1ada1c3817
commit 6923e6273a
4 changed files with 425 additions and 24 deletions
--- a/392
+++ b/392
@ -1,5 +1,6 @@
 #!/usr/bin/python3

+import datetime
 import email.header
 import email.parser
 import hashlib
@ -7,6 +8,7 @@ import html
 import html.parser
 import mailbox
 import os
+import pprint
 import re
 import subprocess
 import sys
@ -243,7 +245,9 @@ def render_body(msg, extra=None):
            if not whole_msg_embedded_id:
                whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">")
                whole_msg_embedded_id = whole_msg_id
-            archive(whole_msg)
+            if whole_msg["Date"] is None:
+                whole_msg["Date"] = msg["Date"]
+            arch.add_message(whole_msg) # XXX - global
            del partial_message_cache[whole_msg_id]
        return "<p>This is part %d of %d of <a href='../%s/'>%s</a></p>" % (
                        int(msg.get_param("number")),
@ -415,7 +419,7 @@ def render_body(msg, extra=None):
    content_type = msg.get_content_type()
    content_disposition = msg.get_content_disposition()
    if content_disposition == "attachment":
-        # XXX - not sure, if we should just store all content-types. 
+        # XXX - not sure if we should just store all content-types. 
        # We probably should clean up html. Alternatively we could just store
        # all of them application/octet-stream, which browsers should download
        # and not try to display.
@ -426,27 +430,6 @@ def render_body(msg, extra=None):
    return jinja2.Markup(bodyhtml)


-def archive(msg):
-    mid = get_message_id(msg)
-    print("M", mid, file=sys.stderr)
-    encmid = encode_message_id(mid)
-    msgdir = basedir + "/msg/" + encmid
-    os.makedirs(msgdir, exist_ok=True)
-    with open(msgdir + "/index.html", "w") as hfd:
-        msgtmpl = jenv.get_template("message.html")
-        bodyhtml = render_body(msg)
-        context = {
-            "list": "LUGA",
-            "message_id": mid,
-            "subject": decode_rfc2047(msg["Subject"]),
-            "from": decode_rfc2047(msg["From"]),
-            "date": msg["Date"],
-            "bodyhtml": bodyhtml,
-        }
-        msghtml = msgtmpl.render(context)
-        hfd.write(msghtml)
-
-
 class HTMLPart(html.parser.HTMLParser):
    """
    A text/html part
@ -913,15 +896,376 @@ class TextFlowedPart:
        return s


+class Message:
+    """
+    A single mail together with the metadata needed for threading.
+
+    Attributes set by __init__:
+    msgid, encmsgid -- the message-id and its encoded form (the latter is
+        used as directory name below basedir/msg/)
+    date -- timezone-aware datetime parsed from the Date header
+    in_reply_to -- list of message-ids this message directly replies to
+    references -- list of all message-ids this message refers to
+    mfrom, subject -- the From and Subject headers as found in the message
+    msg -- the underlying message object
+    kids -- False initially; Thread.as_html() sets it once a reply has been
+        placed below this message
+
+    The attribute thread is not set here; Thread.add_message() sets it.
+    """
+
+    def __init__(self, msg):
+        """
+        Extract message-id, date and reply/reference links from msg.
+
+        msg must support header lookup via msg["Name"].
+        """
+        self.msgid = get_message_id(msg)
+        print("M", self.msgid, file=sys.stderr)
+        self.encmsgid = encode_message_id(self.msgid)
+
+        self.date = email.utils.parsedate_to_datetime(msg["Date"])
+        if self.date.tzinfo is None:
+            # If timezone is missing, assume local time
+            self.date = self.date.astimezone()
+
+        # In-Reply-To headers with more than one message-id are rare, but
+        # standard-conforming, and some MUAs (e.g., mutt) create them.
+        in_reply_to_msgids = self._header_msgids(msg["In-Reply-To"])
+        references_msgids = self._header_msgids(msg["References"])
+
+        # Everything we reply to is also something we refer to.
+        for msgid in in_reply_to_msgids:
+            if msgid not in references_msgids:
+                references_msgids.append(msgid)
+        # Without an In-Reply-To header the last reference is taken as the
+        # parent, so that in_reply_to always holds the latest reference(s).
+        if not in_reply_to_msgids and references_msgids:
+            in_reply_to_msgids = [references_msgids[-1]]
+
+        self.in_reply_to = in_reply_to_msgids
+        self.references = references_msgids
+        self.mfrom = msg["From"]
+        self.subject = msg["Subject"]
+        self.msg = msg
+        self.kids = False
+
+    @staticmethod
+    def _header_msgids(value):
+        """
+        Return the message-ids (without the angle brackets) in a header.
+
+        value may be None, a string or an email.header.Header.
+        """
+        if not value:
+            return []
+        if isinstance(value, email.header.Header):
+            value = value.encode()
+        return re.findall(r'<(.*?)>', value)
+
+    def __repr__(self):
+        return (
+            self.msgid + " " +
+            self.date.strftime("%Y-%m-%d %H:%M:%S%z") +
+            " [" + ", ".join(self.references) + "]"
+        )
+
+    def webify(self):
+        """
+        Write this message as basedir/msg/<encmsgid>/index.html.
+
+        Uses self.thread for the thread overview, so the message must have
+        been added to a Thread before this is called.
+        """
+        msg = self.msg
+        mid = self.msgid
+        print("M", mid, file=sys.stderr)
+        msgdir = basedir + "/msg/" + self.encmsgid
+        os.makedirs(msgdir, exist_ok=True)
+        with open(msgdir + "/index.html", "w") as hfd:
+            msgtmpl = jenv.get_template("message.html")
+            bodyhtml = render_body(msg)
+            context = {
+                "list": "LUGA",
+                "message_id": mid,
+                "subject": decode_rfc2047(msg["Subject"]),
+                "from": decode_rfc2047(msg["From"]),
+                "date": msg["Date"],
+                "bodyhtml": bodyhtml,
+                "threadhtml": self.thread.as_html(),
+            }
+            hfd.write(msgtmpl.render(context))


+# For each message-id, record the thread it belongs to.
+# This should probably be an instance variable of Archive instead of global,
+# but for now it doesn't matter.
+msg2thread = {}
+
+class Thread:
+    """
+    A set of messages connected by their reply/reference links.
+
+    messages maps message-id to Message. threadid stays None until
+    Archive.merge_threads() assigns it. Every message added is also
+    registered in archive.msg2thread and gets its thread attribute set.
+    """
+
+    def __init__(self, archive):
+        self.archive = archive
+        self.messages = {}
+        self.threadid = None
+        self._as_html = None
+
+    def add_message(self, msg):
+        """Add msg to this thread and record the thread on both sides."""
+        self.messages[msg.msgid] = msg
+        self.archive.msg2thread[msg.msgid] = self
+        msg.thread = self
+
+    def merge_thread(self, other):
+        """Move all messages of the thread other into this thread."""
+        for msg in other.messages.values():
+            self.add_message(msg)
+
+    def __repr__(self):
+        if self.threadid:
+            s = self.threadid
+        else:
+            s = str(id(self))
+        if self.messages:
+            s += " {" + ", ".join(self.messages.keys()) + "}"
+        return s
+
+    def fixup_in_reply_tos(self):
+        """
+        Make the in_reply_to links usable for laying out the thread.
+
+        Afterwards every message-id in any in_reply_to list is a key of
+        self.messages and belongs to a message with a strictly earlier date.
+        """
+        # Fix up some problems with in_reply_to:
+        # Sometimes an in_reply_to refers to a message which isn't in the
+        # archive. Add a dummy message if this happens.
+        # Sometimes an in_reply_to refers to a message with a later date.
+        # In this case one of the two date headers must be wrong. We could try
+        # to analyze other headers (especially received), but for now we just
+        # assume that it is the referrer (although in the example I'm
+        # currently looking at it is the referenced message) and adjust that.
+        # We should preserve the original date header, though. Use separate
+        # sort_date and date?
+        missing = set()
+        for m in self.messages.values():
+            for r in m.in_reply_to:
+                if r not in self.messages:
+                    missing.add(r)
+        for r in missing:
+            # Each dummy is placed one second before the earliest message
+            # known so far (including dummies added previously).
+            firstdate = min(m.date for m in self.messages.values())
+            msg = email.message.EmailMessage()
+            msg["Message-Id"] = f"<{r}>"
+            msg["Date"] = firstdate - datetime.timedelta(seconds=1)
+            msg["From"] = "unknown@invalid"
+            msg["Subject"] = "(not in archive)"
+            self.add_message(Message(msg))
+
+        # Push replies after their parents until nothing changes. Without a
+        # cycle this settles within len(self.messages) passes (one more pass
+        # confirms it), so the number of passes is bounded accordingly.
+        one_second = datetime.timedelta(seconds=1)
+        for _ in range(len(self.messages) + 1):
+            dates_ok = True
+            for m in self.messages.values():
+                for r in m.in_reply_to:
+                    rr = self.messages[r]
+                    if rr.date >= m.date:
+                        m.date = rr.date + one_second
+                        dates_ok = False
+            if dates_ok:
+                break
+        else:
+            # Still not consistent: some message replies to itself or there
+            # is a cycle of replies. No dates can satisfy that, so drop the
+            # offending links instead of looping forever.
+            for m in self.messages.values():
+                m.in_reply_to = [
+                    r for r in m.in_reply_to
+                    if self.messages[r].date < m.date
+                ]
+
+    def as_html(self):
+        """
+        Return the thread overview (a table with an SVG graph) as markup.
+
+        The result is computed once and cached in self._as_html.
+        """
+        if self._as_html is not None:
+            # This method isn't that expensive, but it isn't idempotent - so we
+            # must not run the algorithm twice on the same thread. Therefore we
+            # remember the result and return it on subsequent runs.
+            return jinja2.Markup(self._as_html)
+        self.fixup_in_reply_tos()
+
+        row = 0
+        max_col = 0
+        nodes = []
+        edges = []
+        lines = []
+        for m in sorted(self.messages.values(), key=lambda msg: msg.date):
+            # We have already fudged the in_reply_to field to always contain
+            # the latest reference(s), so we only need to consider that
+            parents = [self.messages[r] for r in m.in_reply_to]
+            if not parents:
+                # Either the first message in the thread, or a message which
+                # is not in reply to anything, but not the start of the thread
+                # either. The latter will happen if fixup_in_reply_tos adds
+                # more than one dummy message, but it might also happen if we
+                # use different criteria for matching threads (e.g. Subject
+                # or Thread-Index). In that case just start a new column to
+                # get out of the way.
+                if row != 0:
+                    max_col += 1
+                m.x = max_col
+            elif len(parents) == 1:
+                p = parents[0]
+                if p.kids:
+                    # The parent already has kids, so we must move to the side
+                    # to avoid running an edge through an existing kid. We
+                    # could use a sophisticated algorithm to find the best
+                    # position here, but I think it sufficient to just start a
+                    # new column. This may waste some space (there might have
+                    # been a suitable position in the existing columns), but
+                    # it will avoid collisions and is very simple.
+                    max_col += 1
+                    m.x = max_col
+                else:
+                    # Just put the new kid directly below the parent
+                    m.x = p.x
+                p.kids = True
+            else:
+                # Generic case with multiple references.
+                # I think this should always work well if we start a new
+                # column. There may be special cases where we can avoid it, not
+                # sure.
+                max_col += 1
+                m.x = max_col
+            m.y = row
+            nodes.append((m.x, m.y))
+            edges.extend((p.x, p.y, m.x, m.y) for p in parents)
+            lines.append((m.date, m.mfrom, m.subject, m.encmsgid))
+            row += 1
+
+        r = 4
+        fx = 16
+        fy = 32
+
+        def cx(col):
+            # horizontal centre of a column in pixels
+            return col * fx + fx/2
+
+        def cy(line):
+            # vertical centre of a row in pixels
+            return line * fy + fy/2
+
+        parts = ["<table class='thread'>", "<tr>", f"<td rowspan={row}>"]
+        parts.append(f"<svg width={(max_col + 1) * fx} height={row * fy}>")
+        for x1, y1, x2, y2 in edges:
+            if x1 == x2:
+                parts.append(
+                    f"<line x1={cx(x1)} y1={cy(y1)} x2={cx(x2)} y2={cy(y2)}"
+                    " stroke='black' />"
+                )
+            else:
+                # Quadratic curve which bends into the kid's column. The
+                # control point sits halfway between the two rows if they are
+                # adjacent, otherwise one row below the parent.
+                if y2 == y1 + 1:
+                    yc = (y1 + y2) / 2
+                else:
+                    yc = y1 + 1
+                parts.append(
+                    f"<path d='M {cx(x1)} {cy(y1)}"
+                    f" Q {cx(x2)} {cy(yc)} {cx(x2)} {cy(y2)}'"
+                    " stroke='black' fill='none' />"
+                )
+        for nx, ny in nodes:
+            parts.append(f"<circle cx={cx(nx)} cy={cy(ny)} r={r} />")
+        parts.append("</svg>")
+        parts.append("</td>")
+
+        # The first row shares its <tr> with the cell holding the graph.
+        for i, (date, mfrom, subject, encmsgid) in enumerate(lines):
+            if i > 0:
+                parts.append("<tr>")
+            # From and Subject come straight from the mail, so they must be
+            # escaped before the result is marked as safe markup.
+            parts.append(
+                f"<td class='date'><a href='/msg/{html.escape(str(encmsgid))}/'>"
+                f"{html.escape(str(date))}</a></td>"
+            )
+            parts.append(f"<td class='from'>{html.escape(str(mfrom))}</td>")
+            parts.append(
+                f"<td class='subject'>{html.escape(str(subject))}</td>"
+            )
+            parts.append("</tr>")
+        parts.append("</table>")
+
+        self._as_html = "".join(parts)
+        return jinja2.Markup(self._as_html)
+
+    @property
+    def subject(self):
+        """The subject of the first message in self.messages."""
+        return list(self.messages.values())[0].subject
+
+
+class Archive:
+    """
+    All messages of the list and the threads they form.
+
+    messages is the list of Message objects in the order they were added.
+    msg2thread maps each message-id to the Thread it currently belongs to.
+    thread_list is created by merge_threads().
+    """
+
+    def __init__(self):
+        self.messages = []
+        self.msg2thread = {}
+
+    def add_message(self, msg):
+        """
+        Wrap msg in a Message and put it into a new thread of its own.
+
+        A message whose message-id is already known is ignored.
+        """
+        # NOTE(review): self_check() walks all messages, so calling it twice
+        # per added message makes loading quadratic in the archive size.
+        self.self_check()
+        m = Message(msg)
+        if m.msgid in self.msg2thread:
+            # We have already seen this message, so ignore it
+            return
+        t = Thread(self)
+        t.add_message(m)
+        self.messages.append(m)
+        self.self_check()
+
+    def merge_threads(self):
+        """
+        Merge threads which are connected by references and build
+        self.thread_list.
+
+        Each resulting thread gets the date and the encoded message-id of its
+        earliest message as date and threadid.
+        """
+        self.self_check()
+        finished = False
+        while not finished:
+            finished = True
+            for msgid in list(self.msg2thread):
+                thread = self.msg2thread[msgid]
+                for msgid2 in list(thread.messages):
+                    msg = thread.messages[msgid2]
+                    for r in msg.references:
+                        if r in thread.messages:
+                            continue
+                        # references may contain non-existent messages, so
+                        # be careful:
+                        other = self.msg2thread.get(r)
+                        if other is not None:
+                            thread.merge_thread(other)
+                            finished = False
+
+        self.thread_list = []
+        for thread in self.msg2thread.values():
+            if thread.threadid:
+                # Already handled via another message of the same thread
+                continue
+            first = min(thread.messages.values(), key=lambda m: m.date)
+            thread.date = first.date
+            # The threadid is used as directory name and in URLs, so use the
+            # encoded message-id, just like the message directories do.
+            thread.threadid = first.encmsgid
+            self.thread_list.append(thread)
+
+    def webify_messages(self):
+        """Write the HTML page of every message."""
+        self.self_check()
+        # render_body() calls arch.add_message() for reassembled partial
+        # messages. Assuming arch is this archive, that appends to
+        # self.messages while we are iterating; the loop then visits the
+        # appended messages, too.
+        for m in self.messages:
+            m.webify()
+
+    def webify_threads(self):
+        """Write basedir/thread/<threadid>/index.html for every thread."""
+        self.self_check()
+        threadtmpl = jenv.get_template("thread.html")
+        for t in self.thread_list:
+            threaddir = basedir + "/thread/" + t.threadid
+            os.makedirs(threaddir, exist_ok=True)
+            with open(threaddir + "/index.html", "w") as hfd:
+                context = {
+                    "list": "LUGA",
+                    # thread.html shows the subject in title and heading
+                    "subject": decode_rfc2047(t.subject),
+                    "threadhtml": t.as_html(),
+                }
+                hfd.write(threadtmpl.render(context))
+
+    def webify_calendar(self):
+        """Write basedir/cal/index.html, listing threads by year and month."""
+        caltmpl = jenv.get_template("calendar.html")
+        cal = {}
+        for t in self.thread_list:
+            cal.setdefault(t.date.year, {}).setdefault(t.date.month, []).append(t)
+        caldir = basedir + "/cal"
+        os.makedirs(caldir, exist_ok=True)
+        with open(caldir + "/index.html", "w") as hfd:
+            context = {
+                "list": "LUGA",
+                "cal": cal,
+            }
+            hfd.write(caltmpl.render(context))
+
+    def self_check(self):
+        """Assert that no message-id occurs twice in self.messages."""
+        seen = set()
+        for m in self.messages:
+            assert m.msgid not in seen, m.msgid
+            seen.add(m.msgid)
+
+
+# The archive for this run. render_body() refers to it by this global name
+# when it adds reassembled partial messages.
+arch = Archive()
+
 for f in sys.argv[1:]:
     print("F", f, file=sys.stderr)
     mb = mailbox.mbox(f)

     for m in mb:
-        archive(m)
+        arch.add_message(m)
+
+
+# Now I have a lot of 1 message threads
+# Merge them
+arch.merge_threads()
+
+# Then dump all the messages
+arch.webify_messages()
+
+# And the threads
+arch.webify_threads()
+
+# And a calendar view
+arch.webify_calendar()

 # vim: tw=79
--- a/templates/calendar.html
+++ b/templates/calendar.html
@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta charset="utf-8">
+        <title>
+            {{list}} by date
+        </title>
+        <link rel="stylesheet" href="../style/debug.css">
+    </head>
+    <body>
+        <h1>{{list}} by date</h1>
+        <nav>
+          <ul>
+          {% for y in cal | dictsort %}
+            <li>
+            {{y.0}}
+            <ul>
+            {% for m in y.1 | dictsort %}
+              <li>{{m.0}}
+              <ul>
+                {% for t in m.1 %}
+                  <li>
+                    <a href="../thread/{{t.threadid}}/">{{t.subject}}</a>
+                  </li>
+                {% endfor %}
+              </ul>
+              </li>
+            {% endfor %}
+            </ul>
+            </li>
+          {% endfor %}
+          </ul>
+        </nav>
+    </body>
+</html>
+
+
--- a/templates/message.html
+++ b/templates/message.html
@ -9,6 +9,9 @@
    </head>
    <body>
        <h1>{{subject}}</h1>
+        <nav>
+          {{threadhtml}}
+        </nav>
        <table>
            <tr> <th>Message-Id </th> <td>{{message_id}} </td> </tr>
            <tr> <th>From </th> <td>{{from}} </td> </tr>
--- a/templates/thread.html
+++ b/templates/thread.html
@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta charset="utf-8">
+        <title>
+            {{list}}: {{subject}}
+        </title>
+        <link rel="stylesheet" href="../../style/debug.css">
+    </head>
+    <body>
+        <h1>{{subject}}</h1>
+        <nav>
+          {{threadhtml}}
+        </nav>
+    </body>
+</html>
+