From 1ada1c3817b9f307754578c028004d8dbb6f5482 Mon Sep 17 00:00:00 2001
From: "Peter J. Holzer" <hjp@hjp.at>
Date: Sat, 29 Feb 2020 20:25:00 +0100
Subject: [PATCH] Add test program for threading algorithm

---
 mbox2threads | 328 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 328 insertions(+)
 create mode 100755 mbox2threads

diff --git a/mbox2threads b/mbox2threads
new file mode 100755
index 0000000..ad3f67c
--- /dev/null
+++ b/mbox2threads
@@ -0,0 +1,328 @@
+#!/usr/bin/python3
+
+import datetime
+import email.utils
+import mailbox
+import pdb
+import re
+import sys
+
+
+def get_message_id(msg):
+    """
+    Return the first message id from msg's Message-ID header, without <>.
+
+    Raises IndexError if the header is missing or contains no <...> token.
+    msg only needs to support msg["Message-ID"] (e.g. a mailbox message).
+    """
+    # A missing header is None; str() also copes with Header objects.
+    msgids = re.findall(r'<(.*?)>', str(msg["Message-ID"] or ""))
+    return msgids[0]
+
+
+def encode_message_id(msgid):
+    """Return msgid with each character outside the safe set as "{xx}" (hex)."""
+    return re.sub('[^!"$(-.0-9:=@-z|~]', lambda hit: "{%02x}" % ord(hit.group(0)), msgid)
+
+
+class Message:
+    """A single mail, or a stand-in for one that is missing from the archive."""
+
+    def __init__(self, msgid, in_reply_to, references, date, mfrom, subject):
+        # A datetime without tzinfo is taken to be local time by astimezone().
+        self.date = date.astimezone() if date.tzinfo is None else date
+        self.msgid = msgid
+        self.mfrom = mfrom
+        self.subject = subject
+        self.in_reply_to = in_reply_to  # list of msgids
+        self.references = references    # list of msgids
+        # Becomes True in Thread.as_html() once a reply points at this message.
+        self.kids = False
+
+    def __repr__(self):
+        """Return "<msgid> <date> [<ref>, <ref>, ...]" for debugging."""
+        stamp = self.date.strftime("%Y-%m-%d %H:%M:%S%z")
+        refs = ", ".join(self.references)
+        return " ".join([self.msgid, stamp, "[" + refs + "]"])
+
+
+msg2thread = {}
+
+class Thread:
+    """A group of related messages, keyed by msgid, with an HTML renderer."""
+
+    def __init__(self):
+        self.messages = {}
+        self.threadid = None
+
+    def add_message(self, msg):
+        """Add msg to this thread and make msg2thread point its msgid here."""
+        self.messages[msg.msgid] = msg
+        msg2thread[msg.msgid] = self
+
+    def merge_thread(self, other):
+        """Add all messages of other to this thread; other is not emptied."""
+        for msg in other.messages.values():
+            self.add_message(msg)
+
+    def __repr__(self):
+        """Return the threadid (or id() if unset) followed by the msgids."""
+        if self.threadid:
+            s = self.threadid
+        else:
+            s = str(id(self))
+        if self.messages:
+            s += " {" + ", ".join(self.messages.keys()) + "}"
+        return s
+
+    def fixup_in_reply_tos(self):
+        """
+        Make each in_reply_to entry refer to an earlier message of the thread.
+
+        References of a message to itself are dropped. For an in_reply_to
+        which refers to a message that isn't in the archive a dummy message
+        is added. If a message isn't dated later than one it replies to, one
+        of the two date headers must be wrong. We could try to analyze other
+        headers (especially Received), but for now we just assume that it is
+        the referrer and move its date. We should preserve the original date
+        header, though. Use separate sort_date and date?
+        """
+        missing = set()
+        for m in self.messages.values():
+            # A message "replying" to itself could never be dated after its
+            # parent and would keep the loop below running forever.
+            m.in_reply_to = [r for r in m.in_reply_to if r != m.msgid]
+            missing.update(r for r in m.in_reply_to if r not in self.messages)
+        # sorted() makes the dummy dates independent of set iteration order.
+        for r in sorted(missing):
+            # One second before the earliest message so far (dummies included)
+            firstdate = min(m.date for m in self.messages.values())
+            missingdate = firstdate - datetime.timedelta(seconds=1)
+            self.add_message(
+                Message(r, [], [], missingdate,
+                        "unknown@invalid", "(not in archive)"))
+        # Move replies after their parents until nothing changes. One pass per
+        # message suffices unless references form a cycle; then we give up.
+        for _ in range(len(self.messages)):
+            dates_ok = True
+            for m in self.messages.values():
+                for r in m.in_reply_to:
+                    rr = self.messages[r]
+                    if rr.date >= m.date:
+                        m.date = rr.date + datetime.timedelta(seconds=1)
+                        dates_ok = False
+            if dates_ok:
+                break
+
+    def as_html(self):
+        """
+        Return this thread as an HTML table.
+
+        The first cell spans all rows and holds an SVG drawing of the reply
+        graph; each row shows date, sender and subject (HTML-escaped) of one
+        message, in date order. Calls fixup_in_reply_tos() first, so dummy
+        messages may be added and dates changed.
+        """
+        import html  # function-scope import, only needed for escaping here
+
+        self.fixup_in_reply_tos()
+        y = 0
+        x = 0
+        nodes = []
+        edges = []
+        lines = []
+        for m in sorted(self.messages.values(), key=lambda msg: msg.date):
+            # We have already fudged the in_reply_to field to always contain
+            # the latest reference(s), so we only need to consider that
+            if len(m.in_reply_to) == 0:
+                # The first message of the thread just gets a node in column
+                # 0. Further messages without a parent will happen if
+                # fixup_in_reply_tos adds more than one dummy message (or if
+                # we match threads by other criteria, e.g. Subject or
+                # Thread-Index). Start a new column to get out of the way.
+                if y != 0:
+                    x += 1
+                m.x = x
+            elif len(m.in_reply_to) == 1:
+                p = self.messages[m.in_reply_to[0]]
+                if p.kids:
+                    # The parent already has kids, so we must move to the side
+                    # to avoid running an edge through an existing kid. Just
+                    # starting a new column may waste some space, but it
+                    # avoids collisions and is very simple.
+                    x += 1
+                    m.x = x
+                else:
+                    # Just put the new kid directly below the parent
+                    m.x = p.x
+                edges.append((p.x, p.y, m.x, y))
+                p.kids = True
+            else:
+                # Generic case with multiple references. Starting a new
+                # column should always work, even if it isn't always needed.
+                x += 1
+                m.x = x
+                for r in m.in_reply_to:
+                    p = self.messages[r]
+                    edges.append((p.x, p.y, m.x, y))
+            m.y = y
+            nodes.append((m.x, m.y))
+            lines.append((m.date, m.mfrom, m.subject))
+            y += 1
+
+        r = 4    # radius of a node circle
+        fx = 16  # width of one graph column
+        fy = 32  # height of one row
+        s = "<table class='thread'>"
+        s += "<tr>"
+        s += f"<td rowspan={y}>"
+        s += f"<svg width={(x + 1) * fx} height={y * fy}>"
+        for px, py, mx, my in edges:
+            if px == mx:
+                s += f"<line x1={px * fx + fx/2} y1={py * fy + fy/2} x2={mx * fx + fx/2} y2={my * fy + fy/2} stroke='black' />"
+            else:
+                # Curve via a control point in the kid's column, halfway between
+                # the rows if they are adjacent, else one row below the parent.
+                if my == py + 1:
+                    yc = (py + my) / 2
+                else:
+                    yc = py + 1
+                s += f"<path d='M {px * fx + fx/2} {py * fy + fy/2} Q {mx * fx + fx/2} {yc * fy + fy/2} {mx * fx + fx/2} {my * fy + fy/2}' stroke='black' fill='none' />"
+        for nx, ny in nodes:
+            s += f"<circle cx={nx * fx + fx/2} cy={ny * fy + fy/2} r={r} />"
+        s += "</svg>"
+        s += "</td>"
+
+        # The first row continues the <tr> opened above for the graph cell.
+        rows = []
+        for ln in lines:
+            date, mfrom, subject = (html.escape(str(v)) for v in ln)
+            rows.append(
+                f"<td class='date'>{date}</td>"
+                f"<td class='from'>{mfrom}</td>"
+                f"<td class='subject'>{subject}</td>"
+                "</tr>")
+        s += "<tr>".join(rows)
+        s += "</table>"
+        return s
+
+
+def add_message(msg):
+    """
+    Put msg (a mailbox message) into a new Thread of its own.
+
+    If there is no In-Reply-To header, the last entry of References is used
+    as the parent instead. Message ids from In-Reply-To are also appended
+    to the references if they are missing there.
+    """
+    mid = get_message_id(msg)
+    print("M", mid, file=sys.stderr)
+
+    date = email.utils.parsedate_to_datetime(msg["Date"])
+
+    def msgids(header):
+        # A missing header is None. Some headers are returned as
+        # email.header.Header objects instead of str; encode() those first.
+        if not header:
+            return []
+        if isinstance(header, email.header.Header):
+            header = header.encode()
+        return re.findall(r'<(.*?)>', header)
+
+    # In-Reply-To headers with more than one message-id are rare, but
+    # standard-conforming, and some MUAs (e.g., mutt) create them.
+    in_reply_to_msgids = msgids(msg["In-Reply-To"])
+    references_msgids = msgids(msg["References"])
+    for msgid in in_reply_to_msgids:
+        if msgid not in references_msgids:
+            references_msgids.append(msgid)
+    if not in_reply_to_msgids and references_msgids:
+        in_reply_to_msgids = [references_msgids[-1]]
+    t = Thread()
+    t.add_message(
+        Message(mid, in_reply_to_msgids, references_msgids, date,
+                msg["From"], msg["Subject"]))
+
+
+
+for f in sys.argv[1:]:
+    print("F", f, file=sys.stderr)
+    # create=False: a mistyped name must fail, not create an empty mbox file
+    mb = mailbox.mbox(f, create=False)
+    for m in mb:
+        add_message(m)
+
+# At this point every message sits in a thread of its own. Merge threads
+# whose messages reference each other until a whole pass changes nothing.
+
+finished = False
+while not finished:
+    finished = True
+    # Iterate over snapshots: merging adds entries to thread.messages and
+    # rebinds msg2thread values while we are looping.
+    for msgid in list(msg2thread):
+        thread = msg2thread[msgid]
+        for msgid2 in list(thread.messages):
+            msg = thread.messages[msgid2]
+            for r in msg.references:
+                # Skip references which are already in this thread, and
+                # those to nonexistent messages (not in msg2thread at all).
+                if r not in thread.messages and r in msg2thread:
+                    thread.merge_thread(msg2thread[r])
+                    # Something changed, so another pass is needed.
+                    finished = False
+
+# msg2thread maps every message to its thread, so a thread with several
+# messages shows up several times; the threadid set on the first visit
+# marks it as done.
+thread_list = []
+for thread in msg2thread.values():
+    if thread.threadid:
+        continue
+    # The earliest message (the first one in case of a tie) names the
+    # thread and supplies the date by which threads are sorted for output.
+    oldest = min(thread.messages.values(), key=lambda m: m.date)
+    thread.threadid = oldest.msgid
+    thread.date = oldest.date
+    thread_list.append(thread)
+
+# Emit the page: style sheet first, then the threads ordered by start date.
+print("""<!DOCTYPE html>
+<html>
+
+    <head>
+        <meta charset="utf-8">
+        <style>
+            table.thread {
+                border-collapse: collapse;
+            }
+            table.thread tr {
+                height: 32px;
+                font-size: 16px;
+                background-color: #EFE;
+            }
+            table.thread td {
+                overflow: hidden;
+                white-space: nowrap;
+            }
+            .date {
+                width: 8em;
+                padding-right: 0.5em;
+            }
+            .from {
+                max-width: 8em;
+                padding-left: 0.5em;
+                padding-right: 0.5em;
+            }
+            .subject {
+                max-width: 10em;
+                padding-left: 0.5em;
+            }
+        </style>
+    </head>
+    <body>""")
+for thread in sorted(thread_list, key=lambda x: x.date):
+    print(thread.as_html())
+print("    </body>\n</html>")
+
+# vim: tw=79