#!/usr/bin/python3
# Reconstructed from the patch "diff --git a/mbox2threads b/mbox2threads"
# (new file mode 100755, index 0000000..ad3f67c).

import datetime
import email.utils
import mailbox
import pdb
import re
import sys


def get_message_id(msg):
    """
    Extract the message id from a message.

    ``msg`` is anything that supports ``msg["Message-ID"]``. The result is
    the text between the first ``<`` and the following ``>`` of that header
    value, without the angle brackets.

    Note that this assumes that there is (at least) one message id. If
    this is not the case, an exception is raised:

    * TypeError if ``msg["Message-ID"]`` is None (which is how
      ``email.message.Message`` reports a missing header),
    * AttributeError if the header contains no ``<...>`` token.

    These are the exception types the unguarded implementation produced;
    they are kept for compatibility, but we may use something more suitable
    in the future.
    """
    header = msg["Message-ID"]
    if header is None:
        raise TypeError("message has no Message-ID header")
    # The header value is not necessarily a plain str (the compat32 email
    # policy hands back a Header object for values containing raw 8-bit
    # data), so normalise it before applying the regex.
    match = re.search(r'<(.*?)>', str(header))
    if match is None:
        raise AttributeError(
            f"Message-ID header {header!r} contains no <...> id"
        )
    return match.group(1)


def encode_message_id(msgid):
    """
    Return ``msgid`` with every character outside a fixed safe set escaped.

    Characters matched by the negated character class below are replaced by
    ``{xx}``, where ``xx`` is the lowercase hexadecimal code point of the
    character (at least two digits). Braces themselves are outside the safe
    set, so they are escaped as well.
    """
    encmsgid = re.sub(
        '[^!"$(-.0-9:=@-z|~]',
        lambda x: "{%02x}" % (ord(x.group(0))),
        msgid,
    )
    return encmsgid


class Message:
    """
    The per-message data kept for threading.

    The constructor stores its arguments unchanged as attributes of the same
    name, with one exception: a ``date`` without timezone information is
    converted to an aware datetime. ``kids`` is initialised to False.
    """

    def __init__(self, msgid, in_reply_to, references, date, mfrom, subject):
        self.msgid = msgid
        self.in_reply_to = in_reply_to
        self.references = references
        self.date = date
        self.mfrom = mfrom
        self.subject = subject
        self.kids = False
        if self.date.tzinfo is None:
            # If timezone is missing, assume local time
            self.date = self.date.astimezone()

    def __repr__(self):
        """Return ``<msgid> <date with UTC offset> [<references>]``."""
        return (
            self.msgid + " " +
            self.date.strftime("%Y-%m-%d %H:%M:%S%z") +
            " [" + ", ".join(self.references) + "]"
        )


# Maps a message id to the Thread it was most recently added to
# (maintained by Thread.add_message).
msg2thread = {}


class Thread:
    """
    A set of messages belonging together.

    ``messages`` maps message id to Message; ``threadid`` starts out as None.
    """

    def __init__(self):
        self.messages = {}
        self.threadid = None

    def add_message(self, msg):
        """Store ``msg`` under its msgid and record this thread in msg2thread."""
        self.messages[msg.msgid] = msg
        msg2thread[msg.msgid] = self

    def merge_thread(self, other):
        """
        Add every message of ``other`` to this thread.

        msg2thread is re-pointed at this thread for each of them; ``other``
        itself is only read, not emptied.
        """
        for msg in other.messages.values():
            self.add_message(msg)

    def __repr__(self):
        """Return the threadid (or id(self) if unset) plus the message ids."""
        if self.threadid:
            s = self.threadid
        else:
            s = str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to links of this thread usable for layout.

        Adds a placeholder message for every referenced id that is not in
        this thread, then moves the date of any message that is not later
        than a message it replies to.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
+ # Sometimes an in_reply_to refers to a message with a later date. + # In this case one of the two date headers must be wrong. We could try + # to analyze other headers (especially received), but for now we just + # assume that it is the referrer (although in the example I'm + # currently looking at it is the referree) and adjust that. We should + # preserve the original date header, though. Use separate sort_date and + # date? + missing = set() + for m in self.messages.values(): + for r in m.in_reply_to: + if r not in self.messages: + missing.add(r) + for r in missing: + firstdate = sorted(self.messages.values(), key=lambda x: x.date)[0].date + missingdate = firstdate - datetime.timedelta(seconds=1) + self.add_message( + Message(r, [], [], + missingdate, + "unknown@invalid", "(not in archive)") + ) + dates_ok = False + while not dates_ok: + dates_ok = True + for m in self.messages.values(): + for r in m.in_reply_to: + rr = self.messages[r] + if rr.date >= m.date: + m.date = rr.date + datetime.timedelta(seconds=1) + dates_ok = False + + + + def as_html(self): + self.fixup_in_reply_tos() + y = 0 + x = 0 + nodes = [] + edges = [] + lines = [] + for m in sorted(self.messages.values(), key=lambda x: x.date): + # We have already fudged the in_reply_to field to always contain + # the latest reference(s), so we only need to consider that + if len(m.in_reply_to) == 0: + if y == 0: + # first message in thread + # Just add a node + nodes.append((x, y)) + m.x = x + m.y = y + else: + # Not in reply to anything, but not the start of the thread + # either. This will happen if fixup_in_reply_tos adds more + # than one dummy message, but it might also happen if we + # use different criteria for matching threads (e.g. 
Subject + # or Thread-Index) + # Just start a new column to get out of the way + x += 1 + nodes.append((x, y)) + m.x = x + m.y = y + + elif len(m.in_reply_to) == 1: + p = self.messages[m.in_reply_to[0]] + if p.kids: + # The parent already has kids, so we must move to the side + # to avoid running an edge through an existing kid. We + # could use a sophisticated algorithm to find the best + # position here, but I think it sufficient to just start a + # new column. This may waste some space (there might have + # been a suitable position in the existing columns, but it + # will avoid collisions and is very simple. + x += 1 + m.x = x + m.y = y + else: + # Just put the new kid directly below the parent + m.x = p.x + m.y = y + nodes.append((m.x, m.y)) + edges.append((p.x, p.y, m.x, m.y)) + p.kids = True + else: + # Generic case with multiple references. + # I think this should always work well if we start a new + # column. There may be special cases where we can avoid it, not + # sure. + x += 1 + m.x = x + m.y = y + nodes.append((m.x, m.y)) + for r in m.in_reply_to: + p = self.messages[r] + edges.append((p.x, p.y, m.x, m.y)) + lines.append((m.date, m.mfrom, m.subject)) + y += 1 + s = "
" + + r = 4 + fx = 16 + fy = 32 + s += f"" + s += " | " + + # XXX - escape! + s += f"{lines[0][0]} | " + s += f"{lines[0][1]} | " + s += f"{lines[0][2]} | " + s += "
{ln[0]} | " + s += f"{ln[1]} | " + s += f"{ln[2]} | " + s += "