yama/mbox2threads

329 lines
10 KiB
Python
Executable File

#!/usr/bin/python3
import datetime
import email.header
import email.utils
import html
import mailbox
import pdb
import re
import sys
def get_message_id(msg):
    """
    Extract the message id from a message.

    The id is the text between the first pair of angle brackets in the
    Message-ID header; the brackets themselves are not part of the result.

    msg only needs to support msg["Message-ID"] lookup. The header value
    may be a str or an email.header.Header instance; it is converted with
    str() before it is searched.

    Raises ValueError if the header is missing (lookup returns None) or
    does not contain an id in angle brackets.
    """
    header = msg["Message-ID"]
    if header is None:
        raise ValueError("message has no Message-ID header")
    # str() is a no-op for plain strings and decodes Header objects, which
    # re.search() cannot handle directly.
    match = re.search(r'<(.*?)>', str(header))
    if match is None:
        raise ValueError(
            "no message id in Message-ID header: %r" % (str(header),))
    return match.group(1)
def encode_message_id(msgid):
    """
    Return msgid with every character outside the allowed set escaped.

    Characters that the negated character class below matches are replaced
    by their code point in lowercase hexadecimal (at least two digits)
    wrapped in braces, e.g. "/" becomes "{2f}". All other characters are
    passed through unchanged.
    """
    def escape(found):
        # found is a single-character match; emit its code point as {hex}
        return "{%02x}" % ord(found.group(0))

    return re.sub('[^!"$(-.0-9:=@-z|~]', escape, msgid)
class Message:
    """
    The parts of a mail message that are used for threading and display.

    Attributes set by the constructor:
    msgid        message id of this message
    in_reply_to  list of message ids this message directly replies to
    references   list of message ids this message references
    date         datetime of the message, always timezone-aware
    mfrom        value of the From header
    subject      value of the Subject header
    kids         starts out False; Thread.as_html sets it on a message
                 once a reply has been placed below it
    """

    def __init__(self, msgid, in_reply_to, references, date, mfrom, subject):
        # A date without timezone information is taken to be local time.
        aware_date = date.astimezone() if date.tzinfo is None else date

        self.msgid, self.mfrom, self.subject = msgid, mfrom, subject
        self.in_reply_to = in_reply_to
        self.references = references
        self.date = aware_date
        self.kids = False

    def __repr__(self):
        stamp = self.date.strftime("%Y-%m-%d %H:%M:%S%z")
        refs = ", ".join(self.references)
        return " ".join((self.msgid, stamp, "[" + refs + "]"))
# Maps each message id to the Thread object the message was most recently
# added to. Thread.add_message keeps this up to date.
msg2thread = {}


class Thread:
    """
    A set of messages which belong together, and its rendering as HTML.

    messages maps message id to message object. threadid starts out as
    None and is assigned by the code using the class.
    """

    def __init__(self):
        self.messages = {}
        self.threadid = None

    def add_message(self, msg):
        """Add msg to this thread and record the thread in msg2thread."""
        self.messages[msg.msgid] = msg
        msg2thread[msg.msgid] = self

    def merge_thread(self, other):
        """
        Add all messages of other to this thread.

        other keeps its messages dict, but msg2thread points to self for
        all of them afterwards.
        """
        # Iterate over a copy so that merging a thread into itself is safe.
        for msg in list(other.messages.values()):
            self.add_message(msg)

    def __repr__(self):
        if self.threadid:
            s = self.threadid
        else:
            s = str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to relation usable for drawing the thread.

        After this method returns, every id in any in_reply_to list refers
        to a message of this thread with a strictly earlier date. Messages
        (their date and in_reply_to attributes) are modified in place.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
        # Sometimes an in_reply_to refers to a message with a later date.
        # In this case one of the two date headers must be wrong. We could try
        # to analyze other headers (especially received), but for now we just
        # assume that it is the referrer (although in the example I'm
        # currently looking at it is the referree) and adjust that. We should
        # preserve the original date header, though. Use separate sort_date and
        # date?

        # A message claiming to be a reply to itself can never be dated
        # after its parent, so drop such references up front.
        for m in self.messages.values():
            if m.msgid in m.in_reply_to:
                m.in_reply_to = [r for r in m.in_reply_to if r != m.msgid]

        missing = set()
        for m in self.messages.values():
            for r in m.in_reply_to:
                if r not in self.messages:
                    missing.add(r)
        # Sorted, so that the dates assigned to the dummy messages do not
        # depend on the iteration order of the set.
        for r in sorted(missing):
            # Each dummy is placed one second before everything already in
            # the thread, including previously added dummies.
            firstdate = min(m.date for m in self.messages.values())
            missingdate = firstdate - datetime.timedelta(seconds=1)
            self.add_message(
                Message(r, [], [],
                        missingdate,
                        "unknown@invalid", "(not in archive)")
            )

        one_second = datetime.timedelta(seconds=1)
        # Push every reply to just after its parent until nothing changes.
        # Without cycles, a chain of replies is at most len(messages)
        # messages long, so this many passes are always enough.
        for _ in range(len(self.messages) + 1):
            dates_ok = True
            for m in self.messages.values():
                for r in m.in_reply_to:
                    rr = self.messages[r]
                    if rr.date >= m.date:
                        m.date = rr.date + one_second
                        dates_ok = False
            if dates_ok:
                return

        # Still not consistent: the in_reply_to relation contains a cycle.
        # Break it by dropping every reference to a message which is not
        # strictly older.
        for m in self.messages.values():
            m.in_reply_to = [
                r for r in m.in_reply_to if self.messages[r].date < m.date
            ]

    def as_html(self):
        """
        Return the thread as an HTML table (a str).

        The first cell spans all rows and contains an SVG graph with one
        node per message and one edge per in_reply_to relation. The other
        cells contain date, sender and subject, one row per message, sorted
        by date. All message data is HTML-escaped.

        Calls fixup_in_reply_tos first, and sets the x, y and kids
        attributes of all messages.
        """
        self.fixup_in_reply_tos()
        # Layout state from a previous call must not influence this one.
        for m in self.messages.values():
            m.kids = False

        y = 0
        x = 0
        nodes = []
        edges = []
        lines = []
        for m in sorted(self.messages.values(), key=lambda msg: msg.date):
            # We have already fudged the in_reply_to field to always contain
            # the latest reference(s), so we only need to consider that
            parents = [self.messages[r] for r in m.in_reply_to]
            if not parents:
                # The first message in the thread just gets a node. Any
                # further message which isn't a reply to anything (e.g.
                # because fixup_in_reply_tos added more than one dummy
                # message) starts a new column to get out of the way.
                if y != 0:
                    x += 1
                m.x = x
            elif len(parents) == 1:
                p = parents[0]
                if p.kids:
                    # The parent already has kids, so we must move to the
                    # side to avoid running an edge through an existing
                    # kid. Starting a new column may waste some space, but
                    # it avoids collisions and is very simple.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent
                    m.x = p.x
                p.kids = True
            else:
                # Generic case with multiple references: always start a
                # new column.
                x += 1
                m.x = x
            m.y = y
            nodes.append((m.x, m.y))
            for p in parents:
                edges.append((p.x, p.y, m.x, m.y))
            lines.append((m.date, m.mfrom, m.subject))
            y += 1

        if not lines:
            return "<table class='thread'></table>"

        r = 4    # radius of a node
        fx = 16  # width of a column
        fy = 32  # height of a row (must match the row height in the CSS)

        def cx(col):
            # horizontal centre of a column
            return col * fx + fx / 2

        def cy(row):
            # vertical centre of a row (row may be fractional)
            return row * fy + fy / 2

        parts = [
            "<table class='thread'>",
            "<tr>",
            f"<td rowspan={y}>",
            f"<svg width={(x + 1) * fx} height={y * fy}>",
        ]
        for px, py, mx, my in edges:
            if px == mx:
                parts.append(
                    f"<line x1={cx(px)} y1={cy(py)} x2={cx(mx)} y2={cy(my)} stroke='black' />"
                )
            else:
                # Curve to the kid's column. The control point sits in the
                # kid's column one row below the parent; if the kid is in
                # that very row, use the point halfway between the rows
                # instead so that the curve doesn't degenerate.
                if my == py + 1:
                    yc = (py + my) / 2
                else:
                    yc = py + 1
                parts.append(
                    f"<path d='M {cx(px)} {cy(py)} Q {cx(mx)} {cy(yc)} {cx(mx)} {cy(my)}' stroke='black' fill='none' />"
                )
        for nx, ny in nodes:
            parts.append(f"<circle cx={cx(nx)} cy={cy(ny)} r={r} />")
        parts.append("</svg>")
        parts.append("</td>")
        for i, (date, mfrom, subject) in enumerate(lines):
            # The first row has already been opened for the graph cell.
            if i > 0:
                parts.append("<tr>")
            # Header values come straight from the mail and may be None or
            # non-str objects, hence str() before escaping.
            parts.append(f"<td class='date'>{html.escape(str(date))}</td>")
            parts.append(f"<td class='from'>{html.escape(str(mfrom))}</td>")
            parts.append(f"<td class='subject'>{html.escape(str(subject))}</td>")
            parts.append("</tr>")
        parts.append("</table>")
        return "".join(parts)
def _extract_msgids(header):
    """
    Return the list of message ids (without angle brackets) in a header.

    header is the result of a header lookup on a message: None or an empty
    string (giving an empty list), a str, or an email.header.Header.
    """
    if not header:
        return []
    if isinstance(header, email.header.Header):
        header = header.encode()
    return re.findall(r'<(.*?)>', header)


def add_message(msg):
    """
    Create a new single-message Thread for msg.

    msg is a message as produced by iterating over a mailbox. Its message
    id, Date, In-Reply-To, References, From and Subject headers are used.
    The new thread is not returned; it is reachable through the msg2thread
    entry that Thread.add_message records. The message id is also logged
    to stderr.

    Ids from In-Reply-To which are missing from References are appended to
    the references. If there is no In-Reply-To, the last reference is used
    as the message replied to.

    The Date header is parsed with email.utils.parsedate_to_datetime, so a
    message without a parseable date raises whatever that function raises.
    """
    mid = get_message_id(msg)
    print("M", mid, file=sys.stderr)

    date = email.utils.parsedate_to_datetime(msg["Date"])

    # In-Reply-To headers with more than one message-id are rare, but
    # standard-conforming, and some MUAs (e.g., mutt) create them.
    in_reply_to_msgids = _extract_msgids(msg["In-Reply-To"])
    references_msgids = _extract_msgids(msg["References"])

    for msgid in in_reply_to_msgids:
        if msgid not in references_msgids:
            references_msgids.append(msgid)
    if not in_reply_to_msgids and references_msgids:
        # The last reference is the most recent ancestor.
        in_reply_to_msgids = [references_msgids[-1]]

    t = Thread()
    t.add_message(
        Message(
            mid,
            in_reply_to_msgids, references_msgids,
            date,
            msg["From"], msg["Subject"]))
# Main program: read all mbox files given on the command line, group the
# messages into threads and print the threads as one HTML document on
# stdout. Progress is logged to stderr.

for f in sys.argv[1:]:
    print("F", f, file=sys.stderr)
    mb = mailbox.mbox(f)
    try:
        for m in mb:
            add_message(m)
    finally:
        # Release the file handle of this mailbox before opening the next.
        mb.close()

# Now I have a lot of 1 message threads
# Merge them
# Repeat until a complete pass over all messages merges nothing.
finished = False
while not finished:
    finished = True
    for msgid in list(msg2thread.keys()):
        thread = msg2thread[msgid]
        for msgid2 in list(thread.messages.keys()):
            msg = thread.messages[msgid2]
            for r in msg.references:
                # references may contain non-existent messages, so
                # be careful:
                if r not in thread.messages and r in msg2thread:
                    thread.merge_thread(msg2thread[r])
                    finished = False

# Collect each thread once. A thread is named after and dated by its
# earliest message (the first one, if several share the earliest date).
thread_list = []
for thread in msg2thread.values():
    # msg2thread contains a thread once per message; a thread which
    # already has an id has been handled before. Compare with None, since
    # an empty message id is a valid (if odd) thread id.
    if thread.threadid is not None:
        continue
    first = min(thread.messages.values(), key=lambda msg: msg.date)
    thread.date = first.date
    thread.threadid = first.msgid
    thread_list.append(thread)

# The row height must match the row height Thread.as_html assumes for the
# graph. The class names are those used by Thread.as_html.
print("""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<style>
table.thread {
border-collapse: collapse;
}
table.thread tr {
height: 32px;
font-size: 16px;
background-color: #EFE;
}
table.thread td {
overflow: hidden;
white-space: nowrap;
}
.date {
width: 8em;
padding-right: 0.5em;
}
.from {
max-width: 8em;
padding-left: 0.5em;
padding-right: 0.5em;
}
.subject {
max-width: 10em;
padding-left: 0.5em;
}
</style>
</head>
<body>
""")
for thread in sorted(thread_list, key=lambda x: x.date):
    print(thread.as_html())
print("</body>\n</html>")
# vim: tw=79