Add test program for threading algorithm
This commit is contained in:
parent
29b5288519
commit
1ada1c3817
|
@ -0,0 +1,328 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import datetime
|
||||
import email.utils
|
||||
import mailbox
|
||||
import pdb
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
def get_message_id(msg):
    """
    Extract the message id from a message

    The id is the text between the first pair of angle brackets in the
    Message-ID header of msg (anything supporting msg["Message-ID"]).

    Note that this assumes that there is (at least) one message id. If
    this is not the case, it will raise an IndexError (we may use something
    more suitable in the future).
    """
    header = msg["Message-ID"]
    if header is None:
        raise IndexError("message has no Message-ID header")
    # str() so that header values which are not plain strings can be
    # searched, too.
    match = re.search(r'<(.*?)>', str(header))
    if match is None:
        raise IndexError(
            "no message id found in Message-ID header %r" % (header,)
        )
    return match.group(1)
|
||||
|
||||
|
||||
def encode_message_id(msgid):
    """
    Return msgid with all characters outside a fixed safe set escaped

    Every character which is not matched by the safe character class
    below is replaced by its code point in lower case hex (at least two
    digits), wrapped in curly braces. E.g. "/" becomes "{2f}".
    """
    def _escape(match):
        return "{%02x}" % ord(match.group(0))

    return re.sub('[^!"$(-.0-9:=@-z|~]', _escape, msgid)
|
||||
|
||||
|
||||
class Message:
    """
    The header data of a single message which is needed for threading

    msgid, in_reply_to, references, mfrom and subject are stored as
    passed in. date is stored as an aware datetime. kids starts out as
    False.
    """

    def __init__(self, msgid, in_reply_to, references, date, mfrom, subject):
        if date.tzinfo is None:
            # If timezone is missing, assume local time
            date = date.astimezone()
        self.msgid = msgid
        self.in_reply_to = in_reply_to
        self.references = references
        self.date = date
        self.mfrom = mfrom
        self.subject = subject
        self.kids = False

    def __repr__(self):
        # Format: message id, timestamp with UTC offset, list of references
        stamp = self.date.strftime("%Y-%m-%d %H:%M:%S%z")
        refs = "[" + ", ".join(self.references) + "]"
        return " ".join((self.msgid, stamp, refs))
|
||||
|
||||
|
||||
# Maps each message id to the Thread which currently contains that message.
# Maintained by Thread.add_message.
msg2thread = {}


class Thread:
    """
    A set of messages which belong together, indexed by message id

    messages maps message ids to message objects. threadid is None until
    it is assigned from outside of this class.
    """

    def __init__(self):
        self.messages = {}
        self.threadid = None

    def add_message(self, msg):
        """
        Add msg to this thread and record this thread as its owner in
        msg2thread.
        """
        self.messages[msg.msgid] = msg
        msg2thread[msg.msgid] = self

    def merge_thread(self, other):
        """
        Add all messages of other to this thread

        other keeps its messages dict, but msg2thread no longer points to
        it for any of these messages.
        """
        for msg in other.messages.values():
            self.add_message(msg)

    def __repr__(self):
        if self.threadid:
            s = self.threadid
        else:
            s = str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to links of this thread usable for layout

        After this call every message id in an in_reply_to list refers to
        a message in this thread, and (unless the links form a cycle) every
        message is dated later than all the messages it replies to.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
        # Sometimes an in_reply_to refers to a message with a later date.
        # In this case one of the two date headers must be wrong. We could try
        # to analyze other headers (especially received), but for now we just
        # assume that it is the referrer (although in the example I'm
        # currently looking at it is the referree) and adjust that. We should
        # preserve the original date header, though. Use separate sort_date and
        # date?
        missing = set()
        for m in self.messages.values():
            for r in m.in_reply_to:
                if r not in self.messages:
                    missing.add(r)
        # sorted() so that the dates of the dummy messages do not depend on
        # the iteration order of the set.
        for r in sorted(missing):
            # Recomputed on every iteration: each dummy is placed one second
            # before everything added so far, including earlier dummies.
            firstdate = min(m.date for m in self.messages.values())
            missingdate = firstdate - datetime.timedelta(seconds=1)
            self.add_message(
                Message(r, [], [],
                        missingdate,
                        "unknown@invalid", "(not in archive)")
            )
        # Push dates forward until every reply is later than its parents.
        # Without cycles this settles in at most len(messages) passes (the
        # last one only verifies). The cap keeps us from looping forever if
        # broken headers make messages reply to each other.
        for _ in range(len(self.messages)):
            dates_ok = True
            for m in self.messages.values():
                for r in m.in_reply_to:
                    if r == m.msgid:
                        # A message can never be later than itself
                        continue
                    rr = self.messages[r]
                    if rr.date >= m.date:
                        m.date = rr.date + datetime.timedelta(seconds=1)
                        dates_ok = False
            if dates_ok:
                break

    def as_html(self):
        """
        Return the thread as an HTML table

        The table has one row per message (date, from, subject) in date
        order. The first row additionally contains an SVG graph with one
        node per message and one edge per in_reply_to link. An empty
        thread yields an empty table.

        This calls fixup_in_reply_tos and sets x, y and kids on the
        messages.
        """
        import html

        if not self.messages:
            return "<table class='thread'></table>"

        self.fixup_in_reply_tos()
        y = 0
        x = 0
        nodes = []
        edges = []
        lines = []
        placed = set()
        for m in sorted(self.messages.values(), key=lambda msg: msg.date):
            # We have already fudged the in_reply_to field to always contain
            # the latest reference(s), so we only need to consider that.
            # Parents are normally placed before their kids because they are
            # earlier. Links to messages not yet placed (self references,
            # cycles) cannot be drawn and are ignored.
            parents = [self.messages[r] for r in m.in_reply_to if r in placed]
            if not parents:
                if y > 0:
                    # Not in reply to anything, but not the start of the thread
                    # either. This will happen if fixup_in_reply_tos adds more
                    # than one dummy message, but it might also happen if we
                    # use different criteria for matching threads (e.g. Subject
                    # or Thread-Index)
                    # Just start a new column to get out of the way
                    x += 1
                # (else: first message in thread, just add a node)
                m.x = x
                m.y = y
                nodes.append((m.x, m.y))
            elif len(parents) == 1:
                p = parents[0]
                if p.kids:
                    # The parent already has kids, so we must move to the side
                    # to avoid running an edge through an existing kid. We
                    # could use a sophisticated algorithm to find the best
                    # position here, but I think it sufficient to just start a
                    # new column. This may waste some space (there might have
                    # been a suitable position in the existing columns), but it
                    # will avoid collisions and is very simple.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent
                    m.x = p.x
                m.y = y
                nodes.append((m.x, m.y))
                edges.append((p.x, p.y, m.x, m.y))
                p.kids = True
            else:
                # Generic case with multiple references.
                # I think this should always work well if we start a new
                # column. There may be special cases where we can avoid it, not
                # sure.
                x += 1
                m.x = x
                m.y = y
                nodes.append((m.x, m.y))
                for p in parents:
                    edges.append((p.x, p.y, m.x, m.y))
            placed.add(m.msgid)
            lines.append((m.date, m.mfrom, m.subject))
            y += 1

        r = 4    # radius of a node
        fx = 16  # width of a column
        fy = 32  # height of a row

        def cx(col):
            # horizontal centre of a column
            return col * fx + fx / 2

        def cy(row):
            # vertical centre of a row
            return row * fy + fy / 2

        parts = [
            "<table class='thread'>",
            "<tr>",
            f"<td rowspan={y}>",
            f"<svg width={(x + 1) * fx} height={y * fy}>",
        ]
        for x1, y1, x2, y2 in edges:
            if x1 == x2:
                parts.append(
                    f"<line x1={cx(x1)} y1={cy(y1)} x2={cx(x2)} y2={cy(y2)} stroke='black' />"
                )
            else:
                # Quadratic curve into the kid's column. The control point
                # is halfway down if the kid is in the next row, otherwise
                # one row below the parent.
                if y2 == y1 + 1:
                    yc = (y1 + y2) / 2
                else:
                    yc = y1 + 1
                parts.append(
                    f"<path d='M {cx(x1)} {cy(y1)} Q {cx(x2)} {cy(yc)} {cx(x2)} {cy(y2)}' stroke='black' fill='none' />"
                )
        for nx, ny in nodes:
            parts.append(f"<circle cx={cx(nx)} cy={cy(ny)} r={r} />")
        parts.append("</svg>")
        parts.append("</td>")

        for i, (date, mfrom, subject) in enumerate(lines):
            if i > 0:
                # The first row has already been opened for the graph cell
                parts.append("<tr>")
            # The header values come straight from the mails, so they must
            # be escaped. str() because they need not be strings (a missing
            # header is None).
            parts.append(f"<td class='date'>{html.escape(str(date))}</td>")
            parts.append(f"<td class='from'>{html.escape(str(mfrom))}</td>")
            parts.append(f"<td class='subject'>{html.escape(str(subject))}</td>")
            parts.append("</tr>")
        parts.append("</table>")
        return "".join(parts)
|
||||
|
||||
|
||||
def _extract_msgids(header):
    """
    Return the list of message ids (the text inside each <...>) found in
    a header value. A missing or empty header yields an empty list.
    """
    import email.header

    if not header:
        return []
    if isinstance(header, email.header.Header):
        header = header.encode()
    return re.findall(r'<(.*?)>', header)


def add_message(msg):
    """
    Create a new single-message Thread for the mail message msg

    Message-ID, Date, In-Reply-To, References, From and Subject are read
    from msg. The message id is also printed to stderr.

    The stored references are the ids from References, followed by those
    ids from In-Reply-To which are not among them. If there is no
    In-Reply-To, the last reference is used as in_reply_to instead.
    """
    mid = get_message_id(msg)
    print("M", mid, file=sys.stderr)

    date = email.utils.parsedate_to_datetime(msg["Date"])

    # In-Reply-To headers with more than one message-id are rare, but
    # standard-conforming, and some MUAs (e.g., mutt) create them.
    in_reply_to_msgids = _extract_msgids(msg["In-Reply-To"])
    references_msgids = _extract_msgids(msg["References"])

    for msgid in in_reply_to_msgids:
        if msgid not in references_msgids:
            references_msgids.append(msgid)
    if not in_reply_to_msgids and references_msgids:
        # No In-Reply-To: assume that this is a reply to the last reference
        in_reply_to_msgids = [references_msgids[-1]]
    t = Thread()
    t.add_message(
        Message(
            mid,
            in_reply_to_msgids, references_msgids,
            date,
            msg["From"], msg["Subject"]))
|
||||
|
||||
|
||||
|
||||
# Read all mbox files named on the command line and create a single-message
# thread for every message in them.
for f in sys.argv[1:]:
    print("F", f, file=sys.stderr)
    # create=False: a mistyped file name should be an error, not silently
    # create an empty mailbox file.
    mb = mailbox.mbox(f, create=False)
    try:
        for m in mb:
            add_message(m)
    finally:
        mb.close()
|
||||
|
||||
# At this point every message sits in a thread of its own.
# Merge threads which reference each other, and repeat until a complete pass
# over all messages doesn't merge anything any more.

finished = False
while not finished:
    finished = True
    for msgid in list(msg2thread):
        thread = msg2thread[msgid]
        # Iterate over snapshots since merging changes both dicts
        for msg in list(thread.messages.values()):
            for r in msg.references:
                # references may contain non-existent messages, so
                # only merge if we know the thread of the referenced message
                if r not in thread.messages and r in msg2thread:
                    thread.merge_thread(msg2thread[r])
                    finished = False
|
||||
|
||||
# Collect each thread once. A thread is identified and dated by its earliest
# message.
thread_list = []
for thread in msg2thread.values():
    # msg2thread contains a thread once per message. A thread which already
    # has a threadid has been handled before.
    if not thread.threadid:
        # min() returns the first of several equally early messages
        earliest = min(thread.messages.values(), key=lambda m: m.date)
        thread.threadid = earliest.msgid
        thread.date = earliest.date
        thread_list.append(thread)
|
||||
|
||||
# Write the HTML document to stdout: header with style sheet, one table per
# thread (oldest thread first), then the closing tags.
# The cells emitted by Thread.as_html have the classes date, from and
# subject, so these are the selectors used here.
print("""
<!DOCTYPE html>
<html>

<head>
<meta charset="utf-8">
<style>
table.thread {
    border-collapse: collapse;
}
table.thread tr {
    height: 32px;
    font-size: 16px;
    background-color: #EFE;
}
table.thread td {
    overflow: hidden;
    white-space: nowrap;
}
.date {
    width: 8em;
    padding-right: 0.5em;
}
.from {
    max-width: 8em;
    padding-left: 0.5em;
    padding-right: 0.5em;
}
.subject {
    max-width: 10em;
    padding-left: 0.5em;
}
</style>
</head>
<body>
""")
for thread in sorted(thread_list, key=lambda x: x.date):
    print(thread.as_html())
print("""
</body>
</html>
""")
|
||||
|
||||
# vim: tw=79
|
Loading…
Reference in New Issue