Merge thread-handling from mbox2threads into mbox2web

2020-04-12 23:08:10 +02:00 · 2020-04-12 23:08:10 +02:00 · 6923e6273a
parent 1ada1c3817
commit 6923e6273a
4 changed files with 425 additions and 24 deletions
--- a/392
+++ b/392
@ -1,5 +1,6 @@
 #!/usr/bin/python3
 import datetime
 import email.header
 import email.parser
 import hashlib
@ -7,6 +8,7 @@ import html
 import html.parser
 import mailbox
 import os
 import pprint
 import re
 import subprocess
 import sys
@ -243,7 +245,9 @@ def render_body(msg, extra=None):
            if not whole_msg_embedded_id:
                whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">")
                whole_msg_embedded_id = whole_msg_id
-            archive(whole_msg)
+            if whole_msg["Date"] is None:
                whole_msg["Date"] = msg["Date"]
            arch.add_message(whole_msg) # XXX - global
            del partial_message_cache[whole_msg_id]
        return "<p>This is part %d of %d of <a href='../%s/'>%s</a></p>" % (
                        int(msg.get_param("number")),
@ -415,7 +419,7 @@ def render_body(msg, extra=None):
    content_type = msg.get_content_type()
    content_disposition = msg.get_content_disposition()
    if content_disposition == "attachment":
-        # XXX - not sure, if we should just store all content-types. 
+        # XXX - not sure if we should just store all content-types. 
        # We probably should clean up html. Alternatively we could just store
        # all of them application/octet-stream, which browsers should download
        # and not try to display.
@ -426,27 +430,6 @@ def render_body(msg, extra=None):
    return jinja2.Markup(bodyhtml)
 def archive(msg):
    """
    Render a single message to <basedir>/msg/<encoded message-id>/index.html.

    The message-id is logged to stderr, the target directory is created if
    necessary, and the "message.html" template is filled with the decoded
    Subject/From headers, the raw Date header and the rendered body.
    """
    message_id = get_message_id(msg)
    print("M", message_id, file=sys.stderr)
    target_dir = basedir + "/msg/" + encode_message_id(message_id)
    os.makedirs(target_dir, exist_ok=True)
    # The output file is opened before the template and body are processed,
    # so the order of side effects is: open, load template, render body,
    # decode headers, write.
    with open(target_dir + "/index.html", "w") as out:
        template = jenv.get_template("message.html")
        rendered_body = render_body(msg)
        out.write(
            template.render(
                {
                    "list": "LUGA",
                    "message_id": message_id,
                    "subject": decode_rfc2047(msg["Subject"]),
                    "from": decode_rfc2047(msg["From"]),
                    "date": msg["Date"],
                    "bodyhtml": rendered_body,
                }
            )
        )
 class HTMLPart(html.parser.HTMLParser):
    """
    A text/html part
@ -913,15 +896,376 @@ class TextFlowedPart:
        return s
 class Message:
    """
    One mail message together with the data needed for threading.

    Attributes set by __init__:
      msgid       -- message-id as returned by get_message_id()
      encmsgid    -- encode_message_id(msgid); used as the directory name
      date        -- timezone-aware datetime parsed from the Date header
      in_reply_to -- list of message-ids this message directly replies to
      references  -- list of all referenced message-ids (includes in_reply_to)
      mfrom       -- raw From header
      subject     -- raw Subject header
      msg         -- the underlying message object
      kids        -- False until Thread.as_html() places a reply below it

    The attribute `thread` is not set here; Thread.add_message() assigns it.
    """

    def __init__(self, msg):
        self.msgid = get_message_id(msg)
        print("M", self.msgid, file=sys.stderr)
        self.encmsgid = encode_message_id(self.msgid)
        self.date = email.utils.parsedate_to_datetime(msg["Date"])
        self.in_reply_to, self.references = self._threading_ids(
            msg["In-Reply-To"], msg["References"]
        )
        self.mfrom = msg["From"]
        self.subject = msg["Subject"]
        self.msg = msg
        self.kids = False
        if self.date.tzinfo is None:
            # If timezone is missing, assume local time
            self.date = self.date.astimezone()

    @staticmethod
    def _msgids(value):
        """
        Return the list of message-ids (without angle brackets) found in a
        header value. The value may be None, a str or an email.header.Header.
        """
        if not value:
            return []
        if isinstance(value, email.header.Header):
            value = value.encode()
        return re.findall(r'<(.*?)>', value)

    @staticmethod
    def _threading_ids(in_reply_to, references):
        """
        Compute (in_reply_to_msgids, references_msgids) from the two raw
        header values.

        In-Reply-To headers with more than one message-id are rare, but
        standard-conforming, and some MUAs (e.g., mutt) create them.

        Every In-Reply-To id is also appended to the references if it is
        missing there. If there is no In-Reply-To but there are references,
        the last reference is used as the single parent.
        """
        in_reply_to_msgids = Message._msgids(in_reply_to)
        references_msgids = Message._msgids(references)
        for msgid in in_reply_to_msgids:
            if msgid not in references_msgids:
                references_msgids.append(msgid)
        if not in_reply_to_msgids and references_msgids:
            # Previously this assigned to a misspelled name, so the fallback
            # was computed and then thrown away.
            in_reply_to_msgids = [references_msgids[-1]]
        return in_reply_to_msgids, references_msgids

    def __repr__(self):
        return (
            self.msgid + " " +
            self.date.strftime("%Y-%m-%d %H:%M:%S%z") +
            " [" + ", ".join(self.references) + "]"
        )

    def webify(self):
        """
        Write this message as <basedir>/msg/<encmsgid>/index.html using the
        "message.html" template. Requires self.thread to be set, because the
        thread overview is embedded in the page.
        """
        msg = self.msg
        mid = self.msgid
        print("M", mid, file=sys.stderr)
        encmid = self.encmsgid
        msgdir = basedir + "/msg/" + encmid
        os.makedirs(msgdir, exist_ok=True)
        # NOTE(review): the file is written in the locale's default encoding;
        # confirm that this matches the charset declared in message.html.
        with open(msgdir + "/index.html", "w") as hfd:
            msgtmpl = jenv.get_template("message.html")
            bodyhtml = render_body(msg)
            context = {
                "list": "LUGA",
                "message_id": mid,
                "subject": decode_rfc2047(msg["Subject"]),
                "from": decode_rfc2047(msg["From"]),
                "date": msg["Date"],
                "bodyhtml": bodyhtml,
                "threadhtml": self.thread.as_html(),
            }
            msghtml = msgtmpl.render(context)
            hfd.write(msghtml)
 # For each message-id, record the thread it belongs to.
 # This should probably be an instance variable of Archive instead of global,
 # but for now it doesn't matter.
 msg2thread = {}
 class Thread:
    """
    A set of messages connected by their references.

    archive is the owning object; its msg2thread dict is updated whenever a
    message is added to this thread. messages maps message-id -> Message.
    threadid stays None until the archive assigns one. _as_html caches the
    rendered thread overview.
    """

    def __init__(self, archive):
        self.archive = archive
        self.messages = {}
        self.threadid = None
        self._as_html = None

    def add_message(self, msg):
        """Add msg to this thread and record the membership on both sides."""
        self.messages[msg.msgid] = msg
        self.archive.msg2thread[msg.msgid] = self
        msg.thread = self

    def merge_thread(self, other):
        """Move all messages of the thread other into this thread."""
        for msg in other.messages.values():
            self.add_message(msg)

    def __repr__(self):
        if self.threadid:
            s = self.threadid
        else:
            s = str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to relation usable for layout: every referenced
        parent exists in the thread and is strictly older than its replies.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
        # Sometimes an in_reply_to refers to a message with a later date.
        # In this case one of the two date headers must be wrong. We could try
        # to analyze other headers (especially received), but for now we just
        # assume that it is the referrer (although in the example I'm
        # currently looking at it is the referree) and adjust that. We should
        # preserve the original date header, though. Use separate sort_date and
        # date?
        missing = set()
        for m in self.messages.values():
            for r in m.in_reply_to:
                if r not in self.messages:
                    missing.add(r)
        for r in missing:
            # Recomputed on every iteration on purpose: each dummy is placed
            # one second before everything already in the thread, including
            # previously added dummies.
            firstdate = min(self.messages.values(), key=lambda x: x.date).date
            missingdate = firstdate - datetime.timedelta(seconds=1)
            msg = email.message.EmailMessage()
            msg["Message-Id"] = f"<{r}>"
            msg["Date"] = missingdate
            msg["From"] = "unknown@invalid"
            msg["Subject"] = "(not in archive)"
            self.add_message(Message(msg))
        # Push replies forward in time until no reply predates its parent.
        # NOTE(review): a reference cycle (or a message replying to itself)
        # would keep this loop running forever - confirm whether such input
        # needs to be handled.
        dates_ok = False
        while not dates_ok:
            dates_ok = True
            for m in self.messages.values():
                for r in m.in_reply_to:
                    rr = self.messages[r]
                    if rr.date >= m.date:
                        m.date = rr.date + datetime.timedelta(seconds=1)
                        dates_ok = False

    @staticmethod
    def _edge_svg(e, fx, fy):
        """
        Return the SVG element for one edge e = (x1, y1, x2, y2) in grid
        coordinates; fx and fy are the pixel sizes of one grid cell.
        """
        if e[0] == e[2]:
            # Same column: a straight vertical line
            return f"<line x1={e[0] * fx + fx/2} y1={e[1] * fy + fy/2} x2={e[2] * fx + fx/2} y2={e[3] * fy + fy/2} stroke='black' />"
        if e[3] == e[1] + 1:
            # Adjacent rows: control point halfway between the two rows.
            # (This used to average y1 with x2 by mistake.)
            yc = (e[1] + e[3]) / 2
        else:
            yc = e[1] + 1
        return f"<path d='M {e[0] * fx + fx/2} {e[1] * fy + fy/2} Q {e[2] * fx + fx/2} {yc * fy + fy/2} {e[2] * fx + fx/2} {e[3] * fy + fy/2}' stroke='black' fill='none' />"

    @staticmethod
    def _line_cells(ln):
        """
        Return the date/from/subject table cells for one
        (date, from, subject, encoded message-id) tuple. All values come from
        mail headers and are therefore HTML-escaped.
        """
        date, mfrom, subject, encmsgid = (html.escape(str(v)) for v in ln)
        return (
            f"<td class='date'><a href='/msg/{encmsgid}/'>{date}</a></td>"
            f"<td class='from'>{mfrom}</td>"
            f"<td class='subject'>{subject}</td>"
        )

    def as_html(self):
        """
        Return the thread overview (an SVG graph next to one table row per
        message) as jinja2.Markup.
        """
        if self._as_html:
            # This method isn't that expensive, but it isn't idempotent - so we
            # must not run the algorithm twice on the same thread. Therefore we
            # remember the result and return it on subsequent runs.
            return jinja2.Markup(self._as_html)
        self.fixup_in_reply_tos()
        y = 0
        x = 0
        nodes = []
        edges = []
        lines = []
        for m in sorted(self.messages.values(), key=lambda msg: msg.date):
            # We have already fudged the in_reply_to field to always contain
            # the latest reference(s), so we only need to consider that
            if len(m.in_reply_to) == 0:
                if y != 0:
                    # Not in reply to anything, but not the start of the thread
                    # either. This will happen if fixup_in_reply_tos adds more
                    # than one dummy message, but it might also happen if we
                    # use different criteria for matching threads (e.g. Subject
                    # or Thread-Index)
                    # Just start a new column to get out of the way
                    x += 1
                # (For the first message in the thread we just add a node.)
                m.x = x
                m.y = y
                nodes.append((x, y))
            elif len(m.in_reply_to) == 1:
                p = self.messages[m.in_reply_to[0]]
                if p.kids:
                    # The parent already has kids, so we must move to the side
                    # to avoid running an edge through an existing kid. We
                    # could use a sophisticated algorithm to find the best
                    # position here, but I think it sufficient to just start a
                    # new column. This may waste some space (there might have
                    # been a suitable position in the existing columns), but it
                    # will avoid collisions and is very simple.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent
                    m.x = p.x
                m.y = y
                nodes.append((m.x, m.y))
                edges.append((p.x, p.y, m.x, m.y))
                p.kids = True
            else:
                # Generic case with multiple references.
                # I think this should always work well if we start a new
                # column. There may be special cases where we can avoid it, not
                # sure.
                x += 1
                m.x = x
                m.y = y
                nodes.append((m.x, m.y))
                for r in m.in_reply_to:
                    p = self.messages[r]
                    edges.append((p.x, p.y, m.x, m.y))
            lines.append((m.date, m.mfrom, m.subject, m.encmsgid))
            y += 1
        radius = 4
        fx = 16
        fy = 32
        parts = [
            "<table class='thread'>",
            "<tr>",
            # The graph occupies one cell spanning all message rows
            f"<td rowspan={y}>",
            f"<svg width={(x + 1) * fx} height={y * fy}>",
        ]
        parts.extend(self._edge_svg(e, fx, fy) for e in edges)
        parts.extend(
            f"<circle cx={n[0] * fx + fx/2} cy={n[1] * fy + fy/2} r={radius} />"
            for n in nodes
        )
        parts.append("</svg>")
        parts.append("</td>")
        parts.append(self._line_cells(lines[0]))
        parts.append("</tr>")
        for ln in lines[1:]:
            parts.append("<tr>")
            parts.append(self._line_cells(ln))
            parts.append("</tr>")
        parts.append("</table>")
        s = "".join(parts)
        self._as_html = s
        return jinja2.Markup(s)

    @property
    def subject(self):
        """Subject of the first message that was added to this thread."""
        return list(self.messages.values())[0].subject
 class Archive:
    """
    All messages read from the mailboxes, grouped into threads.

    messages is the list of Message objects in the order they were added;
    msg2thread maps each message-id to the Thread it currently belongs to.
    thread_list is created by merge_threads().
    """

    def __init__(self):
        self.messages = []
        self.msg2thread = {}

    def add_message(self, msg):
        """
        Wrap msg in a Message and put it into a new single-message thread.
        A message whose message-id is already known is ignored.
        """
        self.self_check()
        m = Message(msg)
        if m.msgid in self.msg2thread:
            # We have already seen this message, so ignore it
            return
        t = Thread(self)
        t.add_message(m)
        self.messages.append(m)
        self.self_check()

    def merge_threads(self):
        """
        Merge threads which are connected by references until nothing changes
        any more, then give every thread an id and a date (both taken from
        its earliest message) and collect the threads in self.thread_list.
        """
        self.self_check()
        finished = False
        while not finished:
            finished = True
            for msgid in list(self.msg2thread):
                thread = self.msg2thread[msgid]
                # Iterate over a copy: merging adds messages to the thread
                for msg in list(thread.messages.values()):
                    for r in msg.references:
                        # references may contain non-existent messages, so
                        # be careful:
                        if r not in thread.messages and r in self.msg2thread:
                            thread.merge_thread(self.msg2thread[r])
                            finished = False
        self.thread_list = []
        # A thread appears once per message in msg2thread; the threadid set
        # on first visit is what keeps it from being processed again.
        for thread in self.msg2thread.values():
            if thread.threadid:
                continue
            messages = iter(thread.messages.values())
            msg = next(messages)
            thread.date = msg.date
            thread.threadid = msg.msgid
            for msg in messages:
                if msg.date < thread.date:
                    thread.threadid = msg.msgid
                    thread.date = msg.date
            self.thread_list.append(thread)

    def webify_messages(self):
        """Write one HTML page per message."""
        self.self_check()
        for m in self.messages:
            m.webify()

    def webify_threads(self):
        """Write <basedir>/thread/<threadid>/index.html for every thread."""
        self.self_check()
        threadtmpl = jenv.get_template("thread.html")
        for t in self.thread_list:
            # NOTE(review): threadid is the raw message-id, which comes from
            # the mail and is used as a path component here - consider using
            # the encoded message-id instead.
            threaddir = basedir + "/thread/" + t.threadid
            os.makedirs(threaddir, exist_ok=True)
            # The template declares charset utf-8, so write utf-8 regardless
            # of the locale.
            with open(threaddir + "/index.html", "w", encoding="utf-8") as hfd:
                context = {
                    "list": "LUGA",
                    # thread.html uses {{subject}} for title and heading
                    "subject": decode_rfc2047(t.subject),
                    "threadhtml": t.as_html(),
                }
                threadhtml = threadtmpl.render(context)
                hfd.write(threadhtml)

    def webify_calendar(self):
        """
        Write <basedir>/cal/index.html, listing the threads grouped by the
        year and month of the thread date.
        """
        caltmpl = jenv.get_template("calendar.html")
        cal = {}
        for t in self.thread_list:
            cal.setdefault(t.date.year, {}).setdefault(t.date.month, []).append(t)
        caldir = basedir + "/cal"
        os.makedirs(caldir, exist_ok=True)
        # The template declares charset utf-8, so write utf-8 regardless of
        # the locale.
        with open(caldir + "/index.html", "w", encoding="utf-8") as hfd:
            context = {
                "list": "LUGA",
                "cal": cal,
            }
            calhtml = caltmpl.render(context)
            hfd.write(calhtml)

    def self_check(self):
        """Assert that no message-id occurs twice in self.messages."""
        # The messages in self.messages must be unique:
        seen = set()
        for m in self.messages:
            assert m.msgid not in seen, m.msgid
            seen.add(m.msgid)
 # Main program: every command line argument is opened as an mbox file and
 # all of its messages are added to one Archive. Each file name is logged to
 # stderr with the prefix "F".
 arch = Archive()
 for f in sys.argv[1:]:
    print("F", f, file=sys.stderr)
    mb = mailbox.mbox(f)
    for m in mb:
-        archive(m)
+        arch.add_message(m)
 # Now I have a lot of 1 message threads
 # Merge them
 arch.merge_threads()
 # Then dump all the messages
 arch.webify_messages()
 # And the threads
 arch.webify_threads()
 # And a calendar view
 arch.webify_calendar()
 # vim: tw=79
--- a/templates/calendar.html
+++ b/templates/calendar.html
@ -0,0 +1,37 @@
 <!DOCTYPE html>
 <html>
    <head>
        <meta charset="utf-8">
        <title>
            {{list}} by date
        </title>
        <link rel="stylesheet" href="../style/debug.css">
    </head>
    <body>
        <h1>{{list}} by date</h1>
        <nav>
          <ul>
          {% for y in cal | dictsort %}
            <li>
            {{y.0}}
            <ul>
            {% for m in y.1 | dictsort %}
              <li>
              {{m.0}}
              <ul>
                {% for t in m.1 %}
                  <li>
                    <a href="../thread/{{t.threadid}}/">{{t.subject}}</a>
                  </li>
                {% endfor %}
              </ul>
              </li>
            {% endfor %}
            </ul>
            </li>
          {% endfor %}
          </ul>
        </nav>
    </body>
 </html>
--- a/templates/message.html
+++ b/templates/message.html
@ -9,6 +9,9 @@
    </head>
    <body>
        <h1>{{subject}}</h1>
        <nav>
          {{threadhtml}}
        </nav>
        <table>
            <tr> <th>Message-Id </th> <td>{{message_id}} </td> </tr>
            <tr> <th>From </th> <td>{{from}} </td> </tr>
--- a/templates/thread.html
+++ b/templates/thread.html
@ -0,0 +1,17 @@
 <!DOCTYPE html>
 <html>
    <head>
        <meta charset="utf-8">
        <title>
            {{list}}: {{subject}}
        </title>
        <link rel="stylesheet" href="../../style/debug.css">
    </head>
    <body>
        <h1>{{subject}}</h1>
        <nav>
          {{threadhtml}}
        </nav>
    </body>
 </html>