#!/usr/bin/python3

import datetime
import email.header
import email.utils
import html
import mailbox
import pdb
import re
import sys


def get_message_id(msg):
    """
    Extract the message id from a message

    msg is anything that can be indexed with a header name (an
    email.message.Message, or a plain dict). The first "<...>" group of
    the Message-ID header is returned without the angle brackets.

    Note that this assumes that there is (at least) one message id. If
    this is not the case (the header is absent, or it contains nothing
    in angle brackets), an IndexError is raised.
    """
    header = msg["Message-ID"]
    if header is None:
        raise IndexError("message has no Message-ID header")
    # str() is a no-op for plain strings and lets header values that are
    # not plain strings be searched as well.
    msgids = re.findall(r'<(.*?)>', str(header))
    if not msgids:
        raise IndexError("Message-ID header contains no message id")
    return msgids[0]


def encode_message_id(msgid):
    """
    Return msgid with every character outside the allowed set escaped.

    Characters matched by the negated character class below are replaced
    by their code point in lower-case hex (at least two digits) wrapped
    in curly braces, e.g. "/" becomes "{2f}". All other characters are
    copied unchanged.
    """
    def _escape(found):
        return "{%02x}" % (ord(found.group(0)))

    return re.sub('[^!"$(-.0-9:=@-z|~]', _escape, msgid)


class Message:
    """
    The header data of one mail that is needed for threading.

    Attributes: msgid, in_reply_to and references (lists of message
    ids), date (always timezone-aware), mfrom, subject, and kids, which
    starts out False.
    """

    def __init__(self, msgid, in_reply_to, references, date, mfrom, subject):
        if date.tzinfo is None:
            # If timezone is missing, assume local time
            date = date.astimezone()
        self.msgid = msgid
        self.in_reply_to = in_reply_to
        self.references = references
        self.date = date
        self.mfrom = mfrom
        self.subject = subject
        self.kids = False

    def __repr__(self):
        # "<msgid> <date with numeric utc offset> [<ref>, <ref>, ...]"
        stamp = self.date.strftime("%Y-%m-%d %H:%M:%S%z")
        refs = ", ".join(self.references)
        return self.msgid + " " + stamp + " [" + refs + "]"


# Maps a message id to the Thread object that most recently adopted the
# message with that id (maintained by Thread.add_message).
msg2thread = {}


class Thread:
    """
    A group of messages which belong to the same conversation.

    messages maps message ids to message objects; threadid is None until
    somebody assigns one.
    """

    def __init__(self):
        self.messages = {}
        self.threadid = None

    def add_message(self, msg):
        """Store msg in this thread and register it in msg2thread."""
        self.messages[msg.msgid] = msg
        msg2thread[msg.msgid] = self

    def merge_thread(self, other):
        """Adopt all messages of other; other itself is not modified."""
        for msg in other.messages.values():
            self.add_message(msg)

    def __repr__(self):
        s = self.threadid if self.threadid else str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to graph of this thread safe to lay out.

        Afterwards every id in any in_reply_to list refers to a message
        of this thread which has a strictly earlier date than the
        referring message. Messages are modified in place.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
        # Sometimes an in_reply_to refers to a message with a later date.
        # In this case one of the two date headers must be wrong. We could try
        # to analyze other headers (especially received), but for now we just
        # assume that it is the referrer and adjust that. We should preserve
        # the original date header, though. Use separate sort_date and date?
        one_second = datetime.timedelta(seconds=1)

        missing = {
            r
            for m in self.messages.values()
            for r in m.in_reply_to
            if r not in self.messages
        }
        # Sorted, so that the dates given to the dummies do not depend on
        # the iteration order of the set. Each dummy is placed one second
        # before the currently earliest message (including earlier dummies).
        for r in sorted(missing):
            firstdate = min(m.date for m in self.messages.values())
            self.add_message(
                Message(r, [], [],
                        firstdate - one_second,
                        "unknown@invalid", "(not in archive)")
            )

        # A message claiming to be a reply to itself can never be made
        # later than its parent, so drop such references.
        for m in self.messages.values():
            if any(self.messages[r] is m for r in m.in_reply_to):
                m.in_reply_to = [
                    r for r in m.in_reply_to if self.messages[r] is not m
                ]

        # Push every message to at least one second after all its parents.
        # Without cycles the longest reply chain has fewer links than there
        # are messages, so that many passes are always enough.
        for _ in range(len(self.messages)):
            changed = False
            for m in self.messages.values():
                for r in m.in_reply_to:
                    parent = self.messages[r]
                    if parent.date >= m.date:
                        m.date = parent.date + one_second
                        changed = True
            if not changed:
                return

        # Still inconsistent: the in_reply_to headers form a cycle. Break
        # it by dropping every reference to a message which isn't earlier.
        for m in self.messages.values():
            m.in_reply_to = [
                r for r in m.in_reply_to if self.messages[r].date < m.date
            ]

    def _layout(self):
        """
        Assign a grid position (x = column, y = row) to every message.

        Messages are processed oldest first, one row per message. Returns
        (nodes, edges, lines, columns): nodes are (x, y) tuples, edges
        are (parent x, parent y, x, y) tuples, lines are (date, from,
        subject) tuples in row order, and columns is the number of
        columns used. Expects fixup_in_reply_tos to have been run.
        """
        # The layout is computed from scratch, so forget the flags left
        # behind by a previous call.
        for m in self.messages.values():
            m.kids = False
        x = 0
        nodes = []
        edges = []
        lines = []
        ordered = sorted(self.messages.values(), key=lambda msg: msg.date)
        for y, m in enumerate(ordered):
            parents = [self.messages[r] for r in m.in_reply_to]
            if not parents:
                # The first message of the thread just gets a node. Any
                # other message without a parent (e.g. a second dummy
                # message) starts a new column to get out of the way.
                if y > 0:
                    x += 1
                m.x = x
            elif len(parents) == 1:
                p = parents[0]
                if p.kids:
                    # The parent already has kids, so we must move to the
                    # side to avoid running an edge through an existing
                    # kid. Starting a new column may waste some space, but
                    # it avoids collisions and is very simple.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent
                    m.x = p.x
                p.kids = True
            else:
                # Generic case with multiple references: always start a
                # new column.
                x += 1
                m.x = x
            m.y = y
            nodes.append((m.x, m.y))
            edges.extend((p.x, p.y, m.x, m.y) for p in parents)
            lines.append((m.date, m.mfrom, m.subject))
        return nodes, edges, lines, x + 1

    def as_html(self):
        """
        Return this thread as an HTML table.

        The first cell spans all rows and contains an SVG drawing of the
        reply graph; each row then shows date, sender and subject of one
        message. All message data is HTML-escaped. A thread without
        messages yields an empty table.
        """
        self.fixup_in_reply_tos()
        if not self.messages:
            return "<table class='thread'></table>"
        nodes, edges, lines, columns = self._layout()
        rows = len(lines)

        r = 4    # radius of a node
        fx = 16  # width of a column
        fy = 32  # height of a row
        out = [
            "<table class='thread'>",
            "<tr>",
            f"<td rowspan={rows}>",
            f"<svg width={columns * fx} height={rows * fy}>",
        ]
        for x1, y1, x2, y2 in edges:
            if x1 == x2:
                out.append(f"<line x1={x1 * fx + fx/2} y1={y1 * fy + fy/2} x2={x2 * fx + fx/2} y2={y2 * fy + fy/2} stroke='black' />")
            else:
                # The curve is bent at a control point in the column of
                # the kid: half a row below the parent if the kid is in
                # the very next row, else a full row below the parent.
                if y2 == y1 + 1:
                    yc = (y1 + y2) / 2
                else:
                    yc = y1 + 1
                out.append(f"<path d='M {x1 * fx + fx/2} {y1 * fy + fy/2} Q {x2 * fx + fx/2} {yc * fy + fy/2} {x2 * fx + fx/2} {y2 * fy + fy/2}' stroke='black' fill='none' />")
        for nx, ny in nodes:
            out.append(f"<circle cx={nx * fx + fx/2} cy={ny * fy + fy/2} r={r} />")
        out.append("</svg>")
        out.append("</td>")

        for i, line in enumerate(lines):
            # The first row was already opened for the cell with the graph
            if i > 0:
                out.append("<tr>")
            # The values come straight from mail headers, so escape them.
            # str() because a value may be None or not a plain string.
            date, mfrom, subject = (html.escape(str(v)) for v in line)
            out.append(f"<td class='date'>{date}</td>")
            out.append(f"<td class='from'>{mfrom}</td>")
            out.append(f"<td class='subject'>{subject}</td>")
            out.append("</tr>")
        out.append("</table>")
        return "".join(out)


def _extract_msgids(header):
    """
    Return the message ids (without angle brackets) found in a header.

    header is the value returned by msg[...]: None if the header is
    absent, a string, or an email.header.Header, which is converted with
    its encode() method before it is searched.
    """
    if not header:
        return []
    if isinstance(header, email.header.Header):
        header = header.encode()
    return re.findall(r'<(.*?)>', header)


def add_message(msg):
    """
    Register a parsed mail as a new thread containing just this message.

    msg must provide the headers Message-ID and Date and may provide
    In-Reply-To, References, From and Subject. The message id is logged
    to stderr. Exceptions raised by get_message_id and by
    email.utils.parsedate_to_datetime are not caught here.
    """
    mid = get_message_id(msg)
    print("M", mid, file=sys.stderr)

    date = email.utils.parsedate_to_datetime(msg["Date"])

    # In-Reply-To headers with more than one message-id are rare, but
    # standard-conforming, and some MUAs (e.g., mutt) create them.
    in_reply_to_msgids = _extract_msgids(msg["In-Reply-To"])
    references_msgids = _extract_msgids(msg["References"])

    # Everything we are a reply to is also a reference.
    for msgid in in_reply_to_msgids:
        if msgid not in references_msgids:
            references_msgids.append(msgid)
    # Without an In-Reply-To header, treat the message as a reply to the
    # last message it references.
    if not in_reply_to_msgids and references_msgids:
        in_reply_to_msgids = [references_msgids[-1]]

    t = Thread()
    t.add_message(
        Message(
            mid,
            in_reply_to_msgids, references_msgids,
            date,
            msg["From"], msg["Subject"]))


# Read every mbox file named on the command line and register its messages.
for f in sys.argv[1:]:
    print("F", f, file=sys.stderr)
    try:
        # create=False: we only read. The default (create=True) would
        # silently create an empty file for a mistyped path.
        mb = mailbox.mbox(f, create=False)
    except mailbox.NoSuchMailboxError:
        print("E", f, "does not exist", file=sys.stderr)
        continue

    try:
        for m in mb:
            add_message(m)
    finally:
        # Release the underlying file before opening the next mailbox.
        mb.close()

# At this point every message sits in a thread of its own.
# Join threads which are connected by a reference, and repeat until a
# complete pass over all messages didn't merge anything.

merged_something = True
while merged_something:
    merged_something = False
    for known_id in list(msg2thread):
        current = msg2thread[known_id]
        for member_id in list(current.messages):
            for ref in current.messages[member_id].references:
                # Nothing to do if the referenced message is already in
                # this thread. References may also name non-existent
                # messages (not in any thread), which are skipped too.
                if ref in current.messages or ref not in msg2thread:
                    continue
                current.merge_thread(msg2thread[ref])
                merged_something = True

# Collect each thread once. A thread is named after its earliest message
# (the first one found, if several share the earliest date) and also takes
# its date from that message.
thread_list = []
for candidate in msg2thread.values():
    # msg2thread has one entry per message, so a thread with several
    # messages shows up several times; it has a threadid after the first.
    if not candidate.threadid:
        earliest = min(candidate.messages.values(), key=lambda m: m.date)
        candidate.date = earliest.date
        candidate.threadid = earliest.msgid
        thread_list.append(candidate)

# Write the page to stdout: header with style sheet, one table per thread
# (oldest thread first), and the closing tags.
# The selectors .date, .from and .subject match the classes of the table
# cells produced by Thread.as_html.
print("""
<!DOCTYPE html>
<html>

    <head>
        <meta charset="utf-8">
        <style>
            table.thread {
                border-collapse: collapse;
            }
            table.thread tr {
                height: 32px;
                font-size: 16px;
                background-color: #EFE;
            }
            table.thread td {
                overflow: hidden;
                white-space: nowrap;
            }
            .date {
                width: 8em;
                padding-right: 0.5em;
            }
            .from {
                max-width: 8em;
                padding-left: 0.5em;
                padding-right: 0.5em;
            }
            .subject {
                max-width: 10em;
                padding-left: 0.5em;
            }
        </style>
    </head>
    <body>
""")
for thread in sorted(thread_list, key=lambda x: x.date):
    print(thread.as_html())
print("""    </body>
</html>""")

# vim: tw=79