From 6923e6273aac576f54d700182b11175e45bb762a Mon Sep 17 00:00:00 2001 From: "Peter J. Holzer" Date: Sun, 12 Apr 2020 23:08:10 +0200 Subject: [PATCH] Merge thread-handling from mbox2threads into mbox2web --- mbox2web | 392 +++++++++++++++++++++++++++++++++++++--- templates/calendar.html | 37 ++++ templates/message.html | 3 + templates/thread.html | 17 ++ 4 files changed, 425 insertions(+), 24 deletions(-) create mode 100644 templates/calendar.html create mode 100644 templates/thread.html diff --git a/mbox2web b/mbox2web index ee1a0b3..f1ffe26 100755 --- a/mbox2web +++ b/mbox2web @@ -1,5 +1,6 @@ #!/usr/bin/python3 +import datetime import email.header import email.parser import hashlib @@ -7,6 +8,7 @@ import html import html.parser import mailbox import os +import pprint import re import subprocess import sys @@ -243,7 +245,9 @@ def render_body(msg, extra=None): if not whole_msg_embedded_id: whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">") whole_msg_embedded_id = whole_msg_id - archive(whole_msg) + if whole_msg["Date"] is None: + whole_msg["Date"] = msg["Date"] + arch.add_message(whole_msg) # XXX - global del partial_message_cache[whole_msg_id] return "

This is part %d of %d of %s

" % ( int(msg.get_param("number")), @@ -415,7 +419,7 @@ def render_body(msg, extra=None): content_type = msg.get_content_type() content_disposition = msg.get_content_disposition() if content_disposition == "attachment": - # XXX - not sure, if we should just store all content-types. + # XXX - not sure if we should just store all content-types. # We probably should clean up html. Alternatively we could just store # all of them application/octet-stream, which browsers should download # and not try to display. @@ -426,27 +430,6 @@ def render_body(msg, extra=None): return jinja2.Markup(bodyhtml) -def archive(msg): - mid = get_message_id(msg) - print("M", mid, file=sys.stderr) - encmid = encode_message_id(mid) - msgdir = basedir + "/msg/" + encmid - os.makedirs(msgdir, exist_ok=True) - with open(msgdir + "/index.html", "w") as hfd: - msgtmpl = jenv.get_template("message.html") - bodyhtml = render_body(msg) - context = { - "list": "LUGA", - "message_id": mid, - "subject": decode_rfc2047(msg["Subject"]), - "from": decode_rfc2047(msg["From"]), - "date": msg["Date"], - "bodyhtml": bodyhtml, - } - msghtml = msgtmpl.render(context) - hfd.write(msghtml) - - class HTMLPart(html.parser.HTMLParser): """ A text/html part @@ -913,15 +896,376 @@ class TextFlowedPart: return s +class Message: + def __init__(self, msg): + self.msgid = get_message_id(msg) + print("M", self.msgid, file=sys.stderr) + self.encmsgid = encode_message_id(self.msgid) + + self.date = email.utils.parsedate_to_datetime(msg["Date"]) + + # In-Reply-To headers with more than one message-id are rare, but + # standard-conforming, and some MUAs (e.g., mutt) create them. 
+ in_reply_to = msg["In-Reply-To"] + if in_reply_to: + if isinstance(in_reply_to, email.header.Header): + in_reply_to = in_reply_to.encode() + in_reply_to_msgids = re.findall(r'<(.*?)>', in_reply_to) + else: + in_reply_to_msgids = [] + + references = msg["References"] + if references: + references_msgids = re.findall(r'<(.*?)>', references) + else: + references_msgids = [] + + for msgid in in_reply_to_msgids: + if msgid not in references_msgids: + references_msgids.append(msgid) + if not in_reply_to_msgids and references_msgids: + in_reply_to_msgid = [references_msgids[-1]] + self.in_reply_to = in_reply_to_msgids + self.references = references_msgids + self.mfrom = msg["From"] + self.subject = msg["Subject"] + self.msg = msg + self.kids = False + if self.date.tzinfo is None: + # If timezone is missing, assume local time + self.date = self.date.astimezone() + + def __repr__(self): + return ( + self.msgid + " " + + self.date.strftime("%Y-%m-%d %H:%M:%S%z") + + " [" + ", ".join(self.references) + "]" + ) + def webify(self): + msg = self.msg + mid = self.msgid + print("M", mid, file=sys.stderr) + encmid = self.encmsgid + msgdir = basedir + "/msg/" + encmid + os.makedirs(msgdir, exist_ok=True) + with open(msgdir + "/index.html", "w") as hfd: + msgtmpl = jenv.get_template("message.html") + bodyhtml = render_body(msg) + context = { + "list": "LUGA", + "message_id": mid, + "subject": decode_rfc2047(msg["Subject"]), + "from": decode_rfc2047(msg["From"]), + "date": msg["Date"], + "bodyhtml": bodyhtml, + "threadhtml": self.thread.as_html(), + } + msghtml = msgtmpl.render(context) + hfd.write(msghtml) +# For each message-id, record the thread it belongs to. +# This should probably be an instance variable of Archive instead of global, +# but for now it doesn't matter.
+msg2thread = {} + +class Thread: + def __init__(self, archive): + self.archive = archive + self.messages = {} + self.threadid = None + self._as_html = None + + def add_message(self, msg): + self.messages[msg.msgid] = msg + self.archive.msg2thread[msg.msgid] = self + msg.thread = self + + def merge_thread(self, other): + for msg in other.messages.values(): + self.add_message(msg) + + def __repr__(self): + if self.threadid: + s = self.threadid + else: + s = str(id(self)) + if self.messages: + s += " {" + ", ".join(self.messages.keys()) + "}" + return s + + def fixup_in_reply_tos(self): + # Fix up some problems with in_reply_to: + # Sometimes an in_reply_to refers to a message which isn't in the + # archive. Add a dummy message if this happens. + # Sometimes an in_reply_to refers to a message with a later date. + # In this case one of the two date headers must be wrong. We could try + # to analyze other headers (especially Received), but for now we just + # assume that it is the referrer (although in the example I'm + # currently looking at it is the referent) and adjust that. We should + # preserve the original date header, though. Use separate sort_date and + # date?
+ missing = set() + for m in self.messages.values(): + for r in m.in_reply_to: + if r not in self.messages: + missing.add(r) + for r in missing: + firstdate = sorted(self.messages.values(), key=lambda x: x.date)[0].date + missingdate = firstdate - datetime.timedelta(seconds=1) + msg = email.message.EmailMessage() + msg["Message-Id"] = f"<{r}>" + msg["Date"] = missingdate + msg["From"] = "unknown@invalid" + msg["Subject"] = "(not in archive)" + self.add_message(Message(msg)) + dates_ok = False + while not dates_ok: + dates_ok = True + for m in self.messages.values(): + for r in m.in_reply_to: + rr = self.messages[r] + if rr.date >= m.date: + m.date = rr.date + datetime.timedelta(seconds=1) + dates_ok = False + + + def as_html(self): + if self._as_html: + # This method isn't that expensive, but it isn't idempotent - so we + # must not run the algorithm twice on the same thread. Therefore we + # remember the result and return it on subsequent runs. + s = self._as_html + return jinja2.Markup(s) + self.fixup_in_reply_tos() + y = 0 + x = 0 + nodes = [] + edges = [] + lines = [] + for m in sorted(self.messages.values(), key=lambda x: x.date): + # We have already fudged the in_reply_to field to always contain + # the latest reference(s), so we only need to consider that + if len(m.in_reply_to) == 0: + if y == 0: + # first message in thread + # Just add a node + nodes.append((x, y)) + m.x = x + m.y = y + else: + # Not in reply to anything, but not the start of the thread + # either. This will happen if fixup_in_reply_tos adds more + # than one dummy message, but it might also happen if we + # use different criteria for matching threads (e.g. Subject + # or Thread-Index) + # Just start a new column to get out of the way + x += 1 + nodes.append((x, y)) + m.x = x + m.y = y + + elif len(m.in_reply_to) == 1: + p = self.messages[m.in_reply_to[0]] + if p.kids: + # The parent already has kids, so we must move to the side + # to avoid running an edge through an existing kid. 
We + # could use a sophisticated algorithm to find the best + # position here, but I think it is sufficient to just start a + # new column. This may waste some space (there might have + # been a suitable position in the existing columns), but it + # will avoid collisions and is very simple. + x += 1 + m.x = x + m.y = y + else: + # Just put the new kid directly below the parent + m.x = p.x + m.y = y + nodes.append((m.x, m.y)) + edges.append((p.x, p.y, m.x, m.y)) + p.kids = True + else: + # Generic case with multiple references. + # I think this should always work well if we start a new + # column. There may be special cases where we can avoid it, not + # sure. + x += 1 + m.x = x + m.y = y + nodes.append((m.x, m.y)) + for r in m.in_reply_to: + p = self.messages[r] + edges.append((p.x, p.y, m.x, m.y)) + lines.append((m.date, m.mfrom, m.subject, m.encmsgid)) + y += 1 + s = "" + s += "" + s += f"" + + # XXX - escape! + s += f"" + s += f"" + s += f"" + s += "" + + for ln in lines[1:]: + s += "" + s += f"" + s += f"" + s += f"" + s += "" + s += "
" + + r = 4 + fx = 16 + fy = 32 + s += f"" + for e in edges: + if e[0] == e[2]: + s += f"" + else: + if e[3] == e[1] + 1: + yc = (e[1] + e[2]) / 2 + else: + yc = e[1] + 1 + s += f"" + for n in nodes: + s += f"" + s += "" + s += "{lines[0][0]}{lines[0][1]}{lines[0][2]}
{ln[0]}{ln[1]}{ln[2]}
" + self._as_html = s + return jinja2.Markup(s) + + + @property + def subject(self): + return list(self.messages.values())[0].subject + + +class Archive: + def __init__(self): + self.messages = [] + self.msg2thread = {} + + def add_message(self, msg): + self.self_check() + m = Message(msg) + if m.msgid in self.msg2thread: + # We have already seen this message, so ignore it + return + t = Thread(self) + t.add_message(m) + self.messages.append(m) + self.self_check() + + def merge_threads(self): + self.self_check() + finished = False + while not finished: + finished = True + for msgid in list(self.msg2thread.keys()): + thread = self.msg2thread[msgid] + for msgid2 in list(thread.messages.keys()): + msg = thread.messages[msgid2] + for r in msg.references: + if r in thread.messages: + pass + else: + # references may contain non-existent messages, so + # be careful: + if r in self.msg2thread: + thread.merge_thread(self.msg2thread[r]) + finished = False + + self.thread_list = [] + for thread in self.msg2thread.values(): + if thread.threadid: + continue + messages = iter(thread.messages.values()) + msg = next(messages) + thread.date = msg.date + thread.threadid = msg.msgid + for msg in messages: + if msg.date < thread.date: + thread.threadid = msg.msgid + thread.date = msg.date + self.thread_list.append(thread) + + def webify_messages(self): + self.self_check() + for m in self.messages: + m.webify() + + def webify_threads(self): + self.self_check() + threadtmpl = jenv.get_template("thread.html") + for t in self.thread_list: + threaddir = basedir + "/thread/" + t.threadid + os.makedirs(threaddir, exist_ok=True) + with open(threaddir + "/index.html", "w") as hfd: + context = { + "list": "LUGA", + "threadhtml": t.as_html(), + } + threadhtml = threadtmpl.render(context) + hfd.write(threadhtml) + + + def webify_calendar(self): + caltmpl = jenv.get_template("calendar.html") + cal = {} + for t in self.thread_list: + y = t.date.year + m = t.date.month + if y not in cal: + cal[y] =
{} + if m not in cal[y]: + cal[y][m] = [] + cal[y][m].append(t) + caldir = basedir + "/cal" + os.makedirs(caldir, exist_ok=True) + with open(caldir + "/index.html", "w") as hfd: + context = { + "list": "LUGA", + "cal": cal, + } + calhtml = caltmpl.render(context) + hfd.write(calhtml) + + + def self_check(self): + # The messages in self.messages must be unique: + + seen = set() + for m in self.messages: + assert m.msgid not in seen, m.msgid + seen.add(m.msgid) + + +arch = Archive() + for f in sys.argv[1:]: print("F", f, file=sys.stderr) mb = mailbox.mbox(f) for m in mb: - archive(m) + arch.add_message(m) + + +# Now I have a lot of 1 message threads +# Merge them +arch.merge_threads() + +# Then dump all the messages +arch.webify_messages() + +# And the threads +arch.webify_threads() + +# And a calendar view +arch.webify_calendar() # vim: tw=79 diff --git a/templates/calendar.html b/templates/calendar.html new file mode 100644 index 0000000..9516366 --- /dev/null +++ b/templates/calendar.html @@ -0,0 +1,37 @@ + + + + + + {{list}}: {{subject}} + + + + +

{{list}} by date

+ + + + + diff --git a/templates/message.html b/templates/message.html index aa3ab6d..03c8d69 100644 --- a/templates/message.html +++ b/templates/message.html @@ -9,6 +9,9 @@

{{subject}}

+ diff --git a/templates/thread.html b/templates/thread.html new file mode 100644 index 0000000..a1b9422 --- /dev/null +++ b/templates/thread.html @@ -0,0 +1,17 @@ + + + + + + {{list}}: {{subject}} + + + + +

{{subject}}

+ + + +
Message-Id {{message_id}}
From {{from}}