diff --git a/mbox2web b/mbox2web index ee1a0b3..f1ffe26 100755 --- a/mbox2web +++ b/mbox2web @@ -1,5 +1,6 @@ #!/usr/bin/python3 +import datetime import email.header import email.parser import hashlib @@ -7,6 +8,7 @@ import html import html.parser import mailbox import os +import pprint import re import subprocess import sys @@ -243,7 +245,9 @@ def render_body(msg, extra=None): if not whole_msg_embedded_id: whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">") whole_msg_embedded_id = whole_msg_id - archive(whole_msg) + if whole_msg["Date"] is None: + whole_msg["Date"] = msg["Date"] + arch.add_message(whole_msg) # XXX - global del partial_message_cache[whole_msg_id] return "
This is part %d of %d of %s
" % ( int(msg.get_param("number")), @@ -415,7 +419,7 @@ def render_body(msg, extra=None): content_type = msg.get_content_type() content_disposition = msg.get_content_disposition() if content_disposition == "attachment": - # XXX - not sure, if we should just store all content-types. + # XXX - not sure if we should just store all content-types. # We probably should clean up html. Alternatively we could just store # all of them application/octet-stream, which browsers should download # and not try to display. @@ -426,27 +430,6 @@ def render_body(msg, extra=None): return jinja2.Markup(bodyhtml) -def archive(msg): - mid = get_message_id(msg) - print("M", mid, file=sys.stderr) - encmid = encode_message_id(mid) - msgdir = basedir + "/msg/" + encmid - os.makedirs(msgdir, exist_ok=True) - with open(msgdir + "/index.html", "w") as hfd: - msgtmpl = jenv.get_template("message.html") - bodyhtml = render_body(msg) - context = { - "list": "LUGA", - "message_id": mid, - "subject": decode_rfc2047(msg["Subject"]), - "from": decode_rfc2047(msg["From"]), - "date": msg["Date"], - "bodyhtml": bodyhtml, - } - msghtml = msgtmpl.render(context) - hfd.write(msghtml) - - class HTMLPart(html.parser.HTMLParser): """ A text/html part @@ -913,15 +896,376 @@ class TextFlowedPart: return s +class Message: + def __init__(self, msg): + self.msgid = get_message_id(msg) + print("M", self.msgid, file=sys.stderr) + self.encmsgid = encode_message_id(self.msgid) + + self.date = email.utils.parsedate_to_datetime(msg["Date"]) + + # In-Reply-To headers with more than one message-id are rare, but + # standard-conforming, and some MUAs (e.g., mutt) create them. 
+ in_reply_to = msg["In-Reply-To"] + if in_reply_to: + if isinstance(in_reply_to, email.header.Header): + in_reply_to = in_reply_to.encode() + in_reply_to_msgids = re.findall(r'<(.*?)>', in_reply_to) + else: + in_reply_to_msgids = [] + + references = msg["References"] + if references: + references_msgids = re.findall(r'<(.*?)>', references) + else: + references_msgids = [] + + for msgid in in_reply_to_msgids: + if msgid not in references_msgids: + references_msgids.append(msgid) + if not in_reply_to_msgids and references_msgids: + in_reply_to_msgid = [references_msgids[-1]] + self.in_reply_to = in_reply_to_msgids + self.references = references_msgids + self.mfrom = msg["From"] + self.subject = msg["Subject"] + self.msg = msg + self.kids = False + if self.date.tzinfo is None: + # If timezone is missing, assume local time + self.date = self.date.astimezone() + + def __repr__(self): + return ( + self.msgid + " " + + self.date.strftime("%Y-%m-%d %H:%M:%S%z") + + " [" + ", ".join(self.references) + "]" + ) + def webify(self): + msg = self.msg + mid = self.msgid + print("M", mid, file=sys.stderr) + encmid = self.encmsgid + msgdir = basedir + "/msg/" + encmid + os.makedirs(msgdir, exist_ok=True) + with open(msgdir + "/index.html", "w") as hfd: + msgtmpl = jenv.get_template("message.html") + bodyhtml = render_body(msg) + context = { + "list": "LUGA", + "message_id": mid, + "subject": decode_rfc2047(msg["Subject"]), + "from": decode_rfc2047(msg["From"]), + "date": msg["Date"], + "bodyhtml": bodyhtml, + "threadhtml": self.thread.as_html(), + } + msghtml = msgtmpl.render(context) + hfd.write(msghtml) +# For each message-id, record the thread it belongs to. +# This should probably be an instance variable of Archive instead of global, +# but for now it doesn't matter. 
+msg2thread = {} + +class Thread: + def __init__(self, archive): + self.archive = archive + self.messages = {} + self.threadid = None + self._as_html = None + + def add_message(self, msg): + self.messages[msg.msgid] = msg + self.archive.msg2thread[msg.msgid] = self + msg.thread = self + + def merge_thread(self, other): + for msg in other.messages.values(): + self.add_message(msg) + + def __repr__(self): + if self.threadid: + s = self.threadid + else: + s = str(id(self)) + if self.messages: + s += " {" + ", ".join(self.messages.keys()) + "}" + return s + + def fixup_in_reply_tos(self): + # Fix up some problems with in_reply_to: + # Sometimes an in_reply_to refers to a message which isn't in the + # archive. Add a dummy message if this happens. + # Sometimes an in_reply_to refers to a message with a later date. + # In this case one of the two date headers must be wrong. We could try + # to analyze other headers (especially received), but for now we just + # assume that it is the referrer (although in the example I'm + # currently looking at it is the referree) and adjust that. We should + # preserve the original date header, though. Use separate sort_date and + # date? 
+ missing = set() + for m in self.messages.values(): + for r in m.in_reply_to: + if r not in self.messages: + missing.add(r) + for r in missing: + firstdate = sorted(self.messages.values(), key=lambda x: x.date)[0].date + missingdate = firstdate - datetime.timedelta(seconds=1) + msg = email.message.EmailMessage() + msg["Message-Id"] = f"<{r}>" + msg["Date"] = missingdate + msg["From"] = "unknown@invalid" + msg["Subject"] = "(not in archive)" + self.add_message(Message(msg)) + dates_ok = False + while not dates_ok: + dates_ok = True + for m in self.messages.values(): + for r in m.in_reply_to: + rr = self.messages[r] + if rr.date >= m.date: + m.date = rr.date + datetime.timedelta(seconds=1) + dates_ok = False + + + def as_html(self): + if self._as_html: + # This method isn't that expensive, but it isn't idempotent - so we + # must not run the algorithm twice on the same thread. Therefore we + # remember the result and return it on subsequent runs. + s = self._as_html + return jinja2.Markup(s) + self.fixup_in_reply_tos() + y = 0 + x = 0 + nodes = [] + edges = [] + lines = [] + for m in sorted(self.messages.values(), key=lambda x: x.date): + # We have already fudged the in_reply_to field to always contain + # the latest reference(s), so we only need to consider that + if len(m.in_reply_to) == 0: + if y == 0: + # first message in thread + # Just add a node + nodes.append((x, y)) + m.x = x + m.y = y + else: + # Not in reply to anything, but not the start of the thread + # either. This will happen if fixup_in_reply_tos adds more + # than one dummy message, but it might also happen if we + # use different criteria for matching threads (e.g. Subject + # or Thread-Index) + # Just start a new column to get out of the way + x += 1 + nodes.append((x, y)) + m.x = x + m.y = y + + elif len(m.in_reply_to) == 1: + p = self.messages[m.in_reply_to[0]] + if p.kids: + # The parent already has kids, so we must move to the side + # to avoid running an edge through an existing kid. 
We + # could use a sophisticated algorithm to find the best + # position here, but I think it sufficient to just start a + # new column. This may waste some space (there might have + # been a suitable position in the existing columns, but it + # will avoid collisions and is very simple. + x += 1 + m.x = x + m.y = y + else: + # Just put the new kid directly below the parent + m.x = p.x + m.y = y + nodes.append((m.x, m.y)) + edges.append((p.x, p.y, m.x, m.y)) + p.kids = True + else: + # Generic case with multiple references. + # I think this should always work well if we start a new + # column. There may be special cases where we can avoid it, not + # sure. + x += 1 + m.x = x + m.y = y + nodes.append((m.x, m.y)) + for r in m.in_reply_to: + p = self.messages[r] + edges.append((p.x, p.y, m.x, m.y)) + lines.append((m.date, m.mfrom, m.subject, m.encmsgid)) + y += 1 + s = "" + + r = 4 + fx = 16 + fy = 32 + s += f"" + s += " | " + + # XXX - escape! + s += f"{lines[0][0]} | " + s += f"{lines[0][1]} | " + s += f"{lines[0][2]} | " + s += "
{ln[0]} | " + s += f"{ln[1]} | " + s += f"{ln[2]} | " + s += "
Message-Id | {{message_id}} |
---|---|
From | {{from}} |