#!/usr/bin/python3
"""
Test program for the mail threading algorithm.

Reads one or more mbox files named on the command line, groups the
messages into threads using their Message-ID, In-Reply-To and References
headers, and prints an HTML page with one table per thread.  Each table
shows a small SVG graph of the reply structure next to the date, sender
and subject of every message.

Provenance: added by commit 1ada1c3817b9f307754578c028004d8dbb6f5482,
"Add test program for threading algorithm" (Peter J. Holzer,
2020-02-29), as the executable file ``mbox2threads``.

NOTE(review): this file was recovered from a copy in which the HTML/SVG
tags inside the string literals had been stripped.  The markup emitted by
``Thread.as_html`` and by ``main`` is therefore a reconstruction - confirm
it against the original commit before relying on the exact output.
"""

import datetime
import email.header
import email.utils
import html
import mailbox
import pdb
import re
import sys


# Geometry of the thread graph, in pixels.
NODE_RADIUS = 4
COLUMN_WIDTH = 16
ROW_HEIGHT = 32

HTML_PROLOGUE = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Threads</title>
<style>
table.thread { border-collapse: collapse; margin-bottom: 1em; }
table.thread td { padding: 0 0.5em; white-space: nowrap; }
table.thread td.graph { padding: 0; }
</style>
</head>
<body>
"""

HTML_EPILOGUE = """</body>
</html>"""


def _find_msgids(header_value):
    """
    Return all message ids (without angle brackets) found in a header value.

    header_value may be None (header missing), a str, or an
    email.header.Header (which the email package returns for headers
    containing non-ASCII data).  A missing or empty header yields [].
    """
    if not header_value:
        return []
    if isinstance(header_value, email.header.Header):
        header_value = header_value.encode()
    return re.findall(r'<(.*?)>', header_value)


def get_message_id(msg):
    """
    Extract the message id from a message

    Returns the first id found in the Message-ID header, without the
    surrounding angle brackets.

    Raises ValueError if the header is missing or contains no
    <...> token.
    """
    msgids = _find_msgids(msg["Message-ID"])
    if not msgids:
        raise ValueError("message has no usable Message-ID header")
    return msgids[0]


def encode_message_id(msgid):
    """
    Encode a message id using only a conservative set of characters.

    Every character outside the allowed set is replaced by its code point
    as lower-case hex in curly braces, e.g. "/" becomes "{2f}".
    """
    encmsgid = re.sub('[^!"$(-.0-9:=@-z|~]', lambda x: "{%02x}" % (ord(x.group(0))), msgid)
    return encmsgid


class Message:
    """
    The threading-relevant data of one mail message.

    msgid, in_reply_to and references are message ids without angle
    brackets (in_reply_to and references are lists).  date is a datetime;
    a naive datetime is interpreted as local time.  kids, x and y are
    scratch attributes used by Thread.as_html for the graph layout.
    """

    def __init__(self, msgid, in_reply_to, references, date, mfrom, subject):
        self.msgid = msgid
        self.in_reply_to = in_reply_to
        self.references = references
        self.date = date
        self.mfrom = mfrom
        self.subject = subject
        self.kids = False
        # Grid position in the thread graph; assigned by Thread.as_html.
        self.x = None
        self.y = None
        if self.date.tzinfo is None:
            # If timezone is missing, assume local time
            self.date = self.date.astimezone()

    def __repr__(self):
        return (
            self.msgid + " " +
            self.date.strftime("%Y-%m-%d %H:%M:%S%z") +
            " [" + ", ".join(self.references) + "]"
        )


# Maps every known message id to the Thread currently containing it.
msg2thread = {}


class Thread:
    """
    A set of messages connected by In-Reply-To/References headers.

    messages maps message id to Message.  threadid and date are filled in
    by collect_threads() once merging is finished (id and date of the
    oldest message).
    """

    def __init__(self):
        self.messages = {}
        self.threadid = None
        self.date = None

    def add_message(self, msg):
        """Add msg to this thread and record that in msg2thread."""
        self.messages[msg.msgid] = msg
        msg2thread[msg.msgid] = self

    def merge_thread(self, other):
        """Move all messages of the thread other into this thread."""
        for msg in other.messages.values():
            self.add_message(msg)

    def __repr__(self):
        if self.threadid:
            s = self.threadid
        else:
            s = str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to links of this thread usable for layout.

        After this call every in_reply_to entry refers to a message in
        this thread whose date is strictly earlier than the referring
        message.  Message dates and in_reply_to lists are modified in
        place.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
        # Sometimes an in_reply_to refers to a message with a later date.
        # In this case one of the two date headers must be wrong. We could try
        # to analyze other headers (especially received), but for now we just
        # assume that it is the referrer (although in the example I'm
        # currently looking at it is the referenced message) and adjust that.
        # We should preserve the original date header, though. Use separate
        # sort_date and date?
        one_second = datetime.timedelta(seconds=1)

        # A message claiming to be a reply to itself can never be dated
        # after its parent; drop such links instead of looping forever.
        for m in self.messages.values():
            if m.msgid in m.in_reply_to:
                m.in_reply_to = [r for r in m.in_reply_to if r != m.msgid]

        # dict.fromkeys keeps first-seen order, so the dummy messages are
        # created in a reproducible order (a set would not guarantee that).
        missing = dict.fromkeys(
            r
            for m in self.messages.values()
            for r in m.in_reply_to
            if r not in self.messages
        )
        for r in missing:
            # Recomputed on every iteration: each dummy is placed one
            # second before everything already in the thread.
            firstdate = min(m.date for m in self.messages.values())
            self.add_message(
                Message(r, [], [],
                        firstdate - one_second,
                        "unknown@invalid", "(not in archive)")
            )

        # Push every reply after its parents.  Without cycles this settles
        # in at most len(messages) passes; one extra pass confirms it.
        for _ in range(len(self.messages) + 1):
            dates_ok = True
            for m in self.messages.values():
                for r in m.in_reply_to:
                    rr = self.messages[r]
                    if rr.date >= m.date:
                        m.date = rr.date + one_second
                        dates_ok = False
            if dates_ok:
                return

        # Still not settled, so the links contain a cycle.  Drop the links
        # which are still violated to break it.
        for m in self.messages.values():
            kept = [r for r in m.in_reply_to if self.messages[r].date < m.date]
            if len(kept) != len(m.in_reply_to):
                print("W", m.msgid, "dropping cyclic in_reply_to",
                      file=sys.stderr)
                m.in_reply_to = kept

    def _layout(self):
        """
        Assign a grid position to every message.

        Returns (nodes, edges, lines, columns): nodes is a list of (x, y),
        edges a list of (parent_x, parent_y, x, y), lines a list of
        (date, from, subject) in row order, and columns the number of
        grid columns used.
        """
        # Reset the scratch flag so that repeated calls give the same layout.
        for m in self.messages.values():
            m.kids = False

        x = 0
        nodes = []
        edges = []
        lines = []
        ordered = sorted(self.messages.values(), key=lambda msg: msg.date)
        for y, m in enumerate(ordered):
            # We have already fudged the in_reply_to field to always contain
            # the latest reference(s), so we only need to consider that
            parents = [self.messages[r] for r in m.in_reply_to]
            if not parents:
                if y > 0:
                    # Not in reply to anything, but not the start of the
                    # thread either. This will happen if fixup_in_reply_tos
                    # adds more than one dummy message, but it might also
                    # happen if we use different criteria for matching
                    # threads (e.g. Subject or Thread-Index)
                    # Just start a new column to get out of the way
                    x += 1
                m.x = x
            elif len(parents) == 1:
                p = parents[0]
                if p.kids:
                    # The parent already has kids, so we must move to the
                    # side to avoid running an edge through an existing kid.
                    # We could use a sophisticated algorithm to find the
                    # best position here, but I think it sufficient to just
                    # start a new column. This may waste some space (there
                    # might have been a suitable position in the existing
                    # columns), but it will avoid collisions and is very
                    # simple.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent
                    m.x = p.x
                p.kids = True
            else:
                # Generic case with multiple references.
                # I think this should always work well if we start a new
                # column. There may be special cases where we can avoid it,
                # not sure.
                x += 1
                m.x = x
            m.y = y
            nodes.append((m.x, m.y))
            edges.extend((p.x, p.y, m.x, m.y) for p in parents)
            lines.append((m.date, m.mfrom, m.subject))
        return nodes, edges, lines, x + 1

    @staticmethod
    def _cells(line):
        """Return the escaped date/from/subject table cells for one row."""
        cells = []
        for css_class, value in zip(("date", "from", "subject"), line):
            # Header values come from untrusted mail: always escape them.
            # A missing header (None) is rendered as an empty cell.
            text = "" if value is None else html.escape(str(value))
            cells.append(f"<td class='{css_class}'>{text}</td>")
        return "".join(cells)

    @classmethod
    def _render(cls, nodes, edges, lines, columns):
        """
        Render the layout as an HTML table with an embedded SVG graph.

        NOTE(review): the tag and attribute names are reconstructed, see
        the module docstring.
        """
        r = NODE_RADIUS
        fx = COLUMN_WIDTH
        fy = ROW_HEIGHT

        def px(col):
            return col * fx + fx // 2

        def py(row):
            return row * fy + fy // 2

        rows = len(lines)
        parts = [
            "<table class='thread'>",
            f"<tr style='height: {fy}px'>",
            f"<td class='graph' rowspan='{rows}'>",
            f"<svg width='{columns * fx}' height='{rows * fy}'>",
        ]
        for x1, y1, x2, y2 in edges:
            if x1 == x2:
                parts.append(
                    f"<line x1='{px(x1)}' y1='{py(y1)}'"
                    f" x2='{px(x2)}' y2='{py(y2)}' stroke='black' />"
                )
            else:
                # Curve from the parent into the kid's column, then go
                # straight down. The curve ends half a row below the parent
                # if the kid is in the very next row, else a full row below.
                if y2 == y1 + 1:
                    yc = py(y1) + fy // 2
                else:
                    yc = py(y1) + fy
                parts.append(
                    f"<path d='M {px(x1)},{py(y1)}"
                    f" Q {px(x2)},{py(y1)} {px(x2)},{yc}"
                    f" L {px(x2)},{py(y2)}'"
                    " stroke='black' fill='none' />"
                )
        for nx, ny in nodes:
            parts.append(f"<circle cx='{px(nx)}' cy='{py(ny)}' r='{r}' />")
        parts.append("</svg>")
        parts.append("</td>")
        parts.append(cls._cells(lines[0]))
        parts.append("</tr>")
        for ln in lines[1:]:
            parts.append(f"<tr style='height: {fy}px'>")
            parts.append(cls._cells(ln))
            parts.append("</tr>")
        parts.append("</table>")
        return "".join(parts)

    def as_html(self):
        """
        Return this thread as an HTML table (one row per message).

        Calls fixup_in_reply_tos() first, so dummy messages may be added
        and message dates adjusted.  Returns "" for an empty thread.
        """
        if not self.messages:
            return ""
        self.fixup_in_reply_tos()
        nodes, edges, lines, columns = self._layout()
        return self._render(nodes, edges, lines, columns)


def add_message(msg):
    """
    Register one parsed mail message as a new single-message thread.

    msg is an email message object (as yielded by mailbox.mbox).  Raises
    ValueError if it has no usable Message-ID; a missing or unparseable
    Date header makes email.utils.parsedate_to_datetime raise (TypeError
    or ValueError, depending on the Python version).
    """
    mid = get_message_id(msg)
    print("M", mid, file=sys.stderr)

    date = email.utils.parsedate_to_datetime(msg["Date"])

    # In-Reply-To headers with more than one message-id are rare, but
    # standard-conforming, and some MUAs (e.g., mutt) create them.
    in_reply_to_msgids = _find_msgids(msg["In-Reply-To"])
    references_msgids = _find_msgids(msg["References"])

    for msgid in in_reply_to_msgids:
        if msgid not in references_msgids:
            references_msgids.append(msgid)
    if not in_reply_to_msgids and references_msgids:
        # No In-Reply-To: the last reference is the direct parent.
        in_reply_to_msgids = [references_msgids[-1]]
    t = Thread()
    t.add_message(
        Message(
            mid,
            in_reply_to_msgids, references_msgids,
            date,
            msg["From"], msg["Subject"]))


def merge_threads():
    """
    Merge the single-message threads in msg2thread into real threads.

    Two threads are merged whenever a message in one of them references a
    message in the other.  Repeats until nothing changes any more.
    """
    finished = False
    while not finished:
        finished = True
        for msgid in list(msg2thread):
            thread = msg2thread[msgid]
            for msgid2 in list(thread.messages):
                msg = thread.messages[msgid2]
                for r in msg.references:
                    # references may contain non-existent messages, so
                    # be careful:
                    if r not in thread.messages and r in msg2thread:
                        thread.merge_thread(msg2thread[r])
                        finished = False


def collect_threads():
    """
    Return the distinct threads in msg2thread, each exactly once.

    Sets threadid and date of every thread to the message id and date of
    its oldest message (the first one in case of equal dates).
    """
    thread_list = []
    for thread in msg2thread.values():
        if thread.threadid is not None:
            # Already seen via another message of the same thread.
            continue
        oldest = min(thread.messages.values(), key=lambda m: m.date)
        thread.date = oldest.date
        thread.threadid = oldest.msgid
        thread_list.append(thread)
    return thread_list


def main(argv):
    """Read the mbox files in argv[1:] and print the threads as HTML."""
    for f in argv[1:]:
        print("F", f, file=sys.stderr)
        # create=False: a mistyped file name must not create an empty mbox.
        mb = mailbox.mbox(f, create=False)

        for m in mb:
            add_message(m)

    # Now I have a lot of 1 message threads
    # Merge them
    merge_threads()
    thread_list = collect_threads()

    print(HTML_PROLOGUE)
    for thread in sorted(thread_list, key=lambda x: x.date):
        print(thread.as_html())
    print(HTML_EPILOGUE)


if __name__ == "__main__":
    main(sys.argv)

# vim: tw=79