#!/usr/bin/python3
"""Group the messages of one or more mbox files into threads and print HTML.

Usage: pass mbox file names on the command line.  Progress ("F <file>",
"M <message-id>") and warnings ("W ...") go to stderr, the HTML to stdout.
"""

import datetime
import email.header
import email.utils
import html
import mailbox
import pdb
import re
import sys

# A message id is whatever sits between angle brackets in a header value.
_MSGID_RE = re.compile(r"<(.*?)>")

# NOTE(review): the page preamble of the original script was lost; this is a
# minimal stand-in - confirm against the intended stylesheet.
_HTML_PREAMBLE = """<!DOCTYPE html>
<meta charset="utf-8">
<style>
table { border-collapse: collapse; }
td { padding: 0 0.5em; vertical-align: middle; }
</style>
"""


def _header_text(value):
    """Return a header value as ``str``; an absent header becomes ``""``.

    Header lookups may yield ``email.header.Header`` objects instead of
    strings, which the regular expressions used here cannot search.
    """
    if value is None:
        return ""
    if isinstance(value, email.header.Header):
        return value.encode()
    return str(value)


def _parse_date(value):
    """Parse a Date header; fall back to the Unix epoch (UTC) if unusable."""
    try:
        return email.utils.parsedate_to_datetime(_header_text(value))
    except (TypeError, ValueError):
        # One broken message should not abort the conversion of the archive.
        print("W unparseable Date header:", repr(value), file=sys.stderr)
        return datetime.datetime.fromtimestamp(0, tz=datetime.timezone.utc)


def get_message_id(msg):
    """
    Extract the message id from a message.

    Only the first ``<...>`` group of the Message-ID header is used.

    Raises:
        ValueError: the header is missing or contains no ``<...>`` group.
    """
    match = _MSGID_RE.search(_header_text(msg["Message-ID"]))
    if match is None:
        raise ValueError("message has no usable Message-ID header")
    return match.group(1)


def encode_message_id(msgid):
    """Replace every character outside the allowed set by ``{xx}`` (hex)."""
    return re.sub(
        '[^!"$(-.0-9:=@-z|~]',
        lambda m: "{%02x}" % ord(m.group(0)),
        msgid,
    )


class Message:
    """The threading-relevant data of one mail message."""

    def __init__(self, msgid, in_reply_to, references, date, mfrom, subject):
        self.msgid = msgid
        self.in_reply_to = in_reply_to
        self.references = references
        self.date = date
        self.mfrom = mfrom
        self.subject = subject
        # Layout state used by Thread.as_html().
        self.kids = False
        self.x = 0
        self.y = 0
        if self.date.tzinfo is None:
            # If timezone is missing, assume local time
            self.date = self.date.astimezone()

    def __repr__(self):
        return (
            self.msgid
            + " "
            + self.date.strftime("%Y-%m-%d %H:%M:%S%z")
            + " ["
            + ", ".join(self.references)
            + "]"
        )


# Maps every known message id to the Thread currently containing it.
msg2thread = {}


class Thread:
    """A set of messages connected through their reference headers."""

    def __init__(self):
        self.messages = {}
        self.threadid = None
        self.date = None

    def add_message(self, msg):
        """Add *msg* and record this thread as its owner in msg2thread."""
        self.messages[msg.msgid] = msg
        msg2thread[msg.msgid] = self

    def merge_thread(self, other):
        """Move all messages of *other* into this thread."""
        if other is self:
            return
        for msg in other.messages.values():
            self.add_message(msg)

    def __repr__(self):
        s = self.threadid if self.threadid else str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """Make the in_reply_to links usable for a top-to-bottom layout.

        Fix up some problems with in_reply_to:

        Sometimes an in_reply_to refers to a message which isn't in the
        archive. Add a dummy message if this happens.

        Sometimes an in_reply_to refers to a message with a later date.
        In this case one of the two date headers must be wrong. We could try
        to analyze other headers (especially received), but for now we just
        assume that it is the referrer and move its date to one second after
        the message it refers to. We should preserve the original date
        header, though. Use separate sort_date and date?

        Links that can never be satisfied (a message replying to itself, or
        a cycle of replies) are dropped, so that afterwards every parent is
        strictly older than its kids.
        """
        one_second = datetime.timedelta(seconds=1)

        # A dict keeps first-seen order, so the dummy dates are reproducible.
        missing = {}
        for m in self.messages.values():
            m.in_reply_to = [r for r in m.in_reply_to if r != m.msgid]
            for r in m.in_reply_to:
                if r not in self.messages:
                    missing[r] = None

        if missing:
            # Each dummy is placed one second before everything seen so far.
            dummy_date = min(m.date for m in self.messages.values())
            for r in missing:
                dummy_date -= one_second
                self.add_message(
                    Message(
                        r, [], [], dummy_date,
                        "unknown@invalid", "(not in archive)",
                    )
                )

        # Without cycles, len(messages) passes are enough to push every
        # message behind its parents; one extra pass confirms stability.
        for _ in range(len(self.messages) + 1):
            changed = False
            for m in self.messages.values():
                for r in m.in_reply_to:
                    parent = self.messages[r]
                    if parent.date >= m.date:
                        m.date = parent.date + one_second
                        changed = True
            if not changed:
                return

        # Still changing: the links form a cycle. Keep only the links that
        # are consistent with the dates we ended up with.
        print("W reply cycle in thread", self, file=sys.stderr)
        for m in self.messages.values():
            m.in_reply_to = [
                r for r in m.in_reply_to if self.messages[r].date < m.date
            ]

    def as_html(self):
        """Return the thread as an HTML table with an SVG reply graph.

        One table row per message (date, sender, subject) in date order;
        the graph has one node per message and one edge per in_reply_to
        link. Returns "" for a thread without messages.

        NOTE(review): the original markup was not recoverable; the tags
        emitted here are a reconstruction - confirm against the intended
        output.
        """
        self.fixup_in_reply_tos()
        if not self.messages:
            return ""

        # Reset layout state so that repeated calls give the same result.
        for m in self.messages.values():
            m.kids = False

        x = 0
        nodes = []
        edges = []
        lines = []
        ordered = sorted(self.messages.values(), key=lambda m: m.date)
        for y, m in enumerate(ordered):
            # in_reply_to has already been fudged to always contain the
            # latest reference(s), so we only need to consider that.
            parents = [self.messages[r] for r in m.in_reply_to]
            if not parents:
                # Either the first message in the thread, or another root
                # (e.g. a second dummy message): roots after the first get
                # a new column to stay out of the way.
                if y > 0:
                    x += 1
                m.x = x
            elif len(parents) == 1:
                p = parents[0]
                if p.kids:
                    # The parent already has kids, so move to the side to
                    # avoid running an edge through an existing kid. A new
                    # column may waste space but never collides.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent.
                    m.x = p.x
                p.kids = True
            else:
                # Generic case with multiple references: a new column
                # should always work.
                x += 1
                m.x = x
            m.y = y
            nodes.append((m.x, m.y))
            edges.extend((p.x, p.y, m.x, m.y) for p in parents)
            lines.append((m.date, m.mfrom, m.subject))

        radius = 4
        fx = 16  # width of one graph column in pixels
        fy = 32  # height of one graph row in pixels

        def px(col):
            return col * fx + fx // 2

        def py(row):
            return row * fy + fy // 2

        svg = [f"<svg width='{(x + 1) * fx}' height='{len(lines) * fy}'>"]
        for x1, y1, x2, y2 in edges:
            if x1 == x2:
                svg.append(
                    f"<line x1='{px(x1)}' y1='{py(y1)}'"
                    f" x2='{px(x2)}' y2='{py(y2)}' stroke='black' />"
                )
            else:
                # Bend into the kid's column half-way down for adjacent
                # rows, otherwise one row below the parent.
                yc = (y1 + y2) / 2 if y2 == y1 + 1 else y1 + 1
                svg.append(
                    f"<path d='M {px(x1)} {py(y1)}"
                    f" Q {px(x2)} {py(y1)} {px(x2)} {py(yc):g}"
                    f" L {px(x2)} {py(y2)}' fill='none' stroke='black' />"
                )
        for nx, ny in nodes:
            svg.append(f"<circle cx='{px(nx)}' cy='{py(ny)}' r='{radius}' />")
        svg.append("</svg>")
        graph = "".join(svg)

        rows = []
        for i, line in enumerate(lines):
            # Header values come from untrusted mail, so escape them.
            cells = "".join(f"<td>{html.escape(str(v))}</td>" for v in line)
            if i == 0:
                cells += f"<td rowspan='{len(lines)}'>{graph}</td>"
            rows.append(f"<tr style='height: {fy}px'>{cells}</tr>\n")
        return "<table>\n" + "".join(rows) + "</table>\n"


def add_message(msg):
    """Register *msg* as a new single-message thread.

    *msg* only needs to support ``msg["Header-Name"]`` lookups that yield
    ``None`` for absent headers.
    """
    mid = get_message_id(msg)
    print("M", mid, file=sys.stderr)
    date = _parse_date(msg["Date"])

    # In-Reply-To headers with more than one message-id are rare, but
    # standard-conforming, and some MUAs (e.g., mutt) create them.
    in_reply_to_msgids = list(
        dict.fromkeys(_MSGID_RE.findall(_header_text(msg["In-Reply-To"])))
    )
    references_msgids = _MSGID_RE.findall(_header_text(msg["References"]))
    for msgid in in_reply_to_msgids:
        if msgid not in references_msgids:
            references_msgids.append(msgid)
    if not in_reply_to_msgids and references_msgids:
        # No In-Reply-To: treat the last reference as the parent.
        in_reply_to_msgids = [references_msgids[-1]]

    t = Thread()
    t.add_message(
        Message(
            mid,
            in_reply_to_msgids,
            references_msgids,
            date,
            _header_text(msg["From"]),
            _header_text(msg["Subject"]),
        )
    )


def merge_threads():
    """Merge threads in msg2thread until no reference crosses two threads."""
    finished = False
    while not finished:
        finished = True
        for msgid in list(msg2thread):
            thread = msg2thread[msgid]
            for msg in list(thread.messages.values()):
                for r in msg.references:
                    # references may contain non-existent messages, so
                    # be careful:
                    if r not in thread.messages and r in msg2thread:
                        thread.merge_thread(msg2thread[r])
                        finished = False


def collect_threads():
    """Return each not yet named thread once, named after its oldest message.

    Sets ``threadid`` and ``date`` of every returned thread.
    """
    thread_list = []
    for thread in msg2thread.values():
        if thread.threadid:
            continue
        first = min(thread.messages.values(), key=lambda m: m.date)
        thread.threadid = first.msgid
        thread.date = first.date
        thread_list.append(thread)
    return thread_list


def main(argv=None):
    """Read the mbox files named in *argv* (default: sys.argv[1:])."""
    files = sys.argv[1:] if argv is None else argv
    for f in files:
        print("F", f, file=sys.stderr)
        # create=False: a mistyped name should fail, not create a mailbox.
        mb = mailbox.mbox(f, create=False)
        try:
            for m in mb:
                add_message(m)
        finally:
            mb.close()

    # Now I have a lot of 1 message threads
    # Merge them
    merge_threads()

    print(_HTML_PREAMBLE)
    for thread in sorted(collect_threads(), key=lambda t: t.date):
        print(thread.as_html())


if __name__ == "__main__":
    main()

# vim: tw=79