Merge thread-handling from mbox2threads into mbox2web

This commit is contained in:
Peter J. Holzer 2020-04-12 23:08:10 +02:00
parent 1ada1c3817
commit 6923e6273a
4 changed files with 425 additions and 24 deletions

392
mbox2web
View File

@ -1,5 +1,6 @@
#!/usr/bin/python3 #!/usr/bin/python3
import datetime
import email.header import email.header
import email.parser import email.parser
import hashlib import hashlib
@ -7,6 +8,7 @@ import html
import html.parser import html.parser
import mailbox import mailbox
import os import os
import pprint
import re import re
import subprocess import subprocess
import sys import sys
@ -243,7 +245,9 @@ def render_body(msg, extra=None):
if not whole_msg_embedded_id: if not whole_msg_embedded_id:
whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">") whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">")
whole_msg_embedded_id = whole_msg_id whole_msg_embedded_id = whole_msg_id
archive(whole_msg) if whole_msg["Date"] is None:
whole_msg["Date"] = msg["Date"]
arch.add_message(whole_msg) # XXX - global
del partial_message_cache[whole_msg_id] del partial_message_cache[whole_msg_id]
return "<p>This is part %d of %d of <a href='../%s/'>%s</a></p>" % ( return "<p>This is part %d of %d of <a href='../%s/'>%s</a></p>" % (
int(msg.get_param("number")), int(msg.get_param("number")),
@ -415,7 +419,7 @@ def render_body(msg, extra=None):
content_type = msg.get_content_type() content_type = msg.get_content_type()
content_disposition = msg.get_content_disposition() content_disposition = msg.get_content_disposition()
if content_disposition == "attachment": if content_disposition == "attachment":
# XXX - not sure, if we should just store all content-types. # XXX - not sure if we should just store all content-types.
# We probably should clean up html. Alternatively we could just store # We probably should clean up html. Alternatively we could just store
# all of them application/octet-stream, which browsers should download # all of them application/octet-stream, which browsers should download
# and not try to display. # and not try to display.
@ -426,27 +430,6 @@ def render_body(msg, extra=None):
return jinja2.Markup(bodyhtml) return jinja2.Markup(bodyhtml)
def archive(msg):
    """
    Render one message as HTML.

    The page is written to <basedir>/msg/<encoded message-id>/index.html
    using the "message.html" template. The message-id is logged to stderr.
    """
    message_id = get_message_id(msg)
    print("M", message_id, file=sys.stderr)
    target_dir = basedir + "/msg/" + encode_message_id(message_id)
    os.makedirs(target_dir, exist_ok=True)
    with open(target_dir + "/index.html", "w") as out:
        template = jenv.get_template("message.html")
        # Render the body before decoding the headers, same order as ever.
        rendered_body = render_body(msg)
        page = template.render({
            "list": "LUGA",
            "message_id": message_id,
            "subject": decode_rfc2047(msg["Subject"]),
            "from": decode_rfc2047(msg["From"]),
            "date": msg["Date"],
            "bodyhtml": rendered_body,
        })
        out.write(page)
class HTMLPart(html.parser.HTMLParser): class HTMLPart(html.parser.HTMLParser):
""" """
A text/html part A text/html part
@ -913,15 +896,376 @@ class TextFlowedPart:
return s return s
class Message:
    """
    A single mail message together with the metadata needed for threading.

    Attributes set by the constructor:
      msgid, encmsgid  - message-id and its encoded (directory name) form
      date             - timezone-aware datetime parsed from the Date header
      in_reply_to      - message-ids of the direct parent(s)
      references       - message-ids of all known ancestors
      mfrom, subject   - raw From and Subject header values
      msg              - the underlying message object
      kids             - layout flag maintained by Thread.as_html

    Thread.add_message additionally sets the attribute `thread`, which
    webify() needs.
    """

    def __init__(self, msg):
        self.msgid = get_message_id(msg)
        print("M", self.msgid, file=sys.stderr)
        self.encmsgid = encode_message_id(self.msgid)
        self.date = email.utils.parsedate_to_datetime(msg["Date"])
        if self.date.tzinfo is None:
            # If timezone is missing, assume local time
            self.date = self.date.astimezone()
        self._set_references(msg)
        self.mfrom = msg["From"]
        self.subject = msg["Subject"]
        self.msg = msg
        self.kids = False

    @staticmethod
    def _header_msgids(value):
        """
        Return the message-ids (without angle brackets) found in a header
        value, in order of first appearance and without duplicates.

        A missing or empty header yields an empty list.
        """
        if not value:
            return []
        if isinstance(value, email.header.Header):
            value = value.encode()
        # dict.fromkeys keeps the first occurrence of each id, in order.
        return list(dict.fromkeys(re.findall(r'<(.*?)>', str(value))))

    def _set_references(self, msg):
        """
        Compute self.in_reply_to and self.references from the headers of msg.
        """
        # In-Reply-To headers with more than one message-id are rare, but
        # standard-conforming, and some MUAs (e.g., mutt) create them.
        in_reply_to_msgids = self._header_msgids(msg["In-Reply-To"])
        references_msgids = self._header_msgids(msg["References"])
        # Every direct parent is also an ancestor.
        for msgid in in_reply_to_msgids:
            if msgid not in references_msgids:
                references_msgids.append(msgid)
        # Without an In-Reply-To header the last reference is the best guess
        # for the parent.
        if not in_reply_to_msgids and references_msgids:
            in_reply_to_msgids = [references_msgids[-1]]
        self.in_reply_to = in_reply_to_msgids
        self.references = references_msgids

    def __repr__(self):
        return (
            self.msgid + " " +
            self.date.strftime("%Y-%m-%d %H:%M:%S%z") +
            " [" + ", ".join(self.references) + "]"
        )

    def webify(self):
        """
        Write this message as <basedir>/msg/<encmsgid>/index.html.

        Requires self.thread to be set (done by Thread.add_message), because
        the page embeds the thread overview.
        """
        msg = self.msg
        print("M", self.msgid, file=sys.stderr)
        msgdir = basedir + "/msg/" + self.encmsgid
        os.makedirs(msgdir, exist_ok=True)
        msgtmpl = jenv.get_template("message.html")
        bodyhtml = render_body(msg)
        context = {
            "list": "LUGA",
            "message_id": self.msgid,
            "subject": decode_rfc2047(msg["Subject"]),
            "from": decode_rfc2047(msg["From"]),
            "date": msg["Date"],
            "bodyhtml": bodyhtml,
            "threadhtml": self.thread.as_html(),
        }
        # Render first so that a failure doesn't leave an empty file behind.
        msghtml = msgtmpl.render(context)
        # The templates declare charset utf-8, so don't depend on the locale.
        with open(msgdir + "/index.html", "w", encoding="utf-8") as hfd:
            hfd.write(msghtml)
# For each message-id, record the thread it belongs to.
# This should probably be an instance variable of Archive instead of global,
# but for now it doesn't matter.
# NOTE(review): the Thread and Archive classes below use Archive.msg2thread
# (via Thread.archive), not this global - confirm whether anything still reads
# it before removing it.
msg2thread = {}
class Thread:
    """
    A set of messages which belong together.

    `messages` maps message-ids to Message objects. `threadid` stays None
    until the archive assigns one. `archive` is the owning archive; its
    msg2thread dict is kept up to date by add_message.
    """

    def __init__(self, archive):
        self.archive = archive
        self.messages = {}
        self.threadid = None
        self._as_html = None

    def add_message(self, msg):
        """Add msg to this thread and register it with the archive."""
        self.messages[msg.msgid] = msg
        self.archive.msg2thread[msg.msgid] = self
        msg.thread = self

    def merge_thread(self, other):
        """Move all messages of the thread `other` into this thread."""
        for msg in list(other.messages.values()):
            self.add_message(msg)

    def __repr__(self):
        s = self.threadid if self.threadid else str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages.keys()) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to relation usable for drawing the thread.

        Afterwards every in_reply_to entry refers to a message in this thread
        with a strictly earlier date. Message dates and in_reply_to lists are
        modified in place.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes a message claims to be a reply to itself. That can never
        # be satisfied, so drop the reference.
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
        # Sometimes an in_reply_to refers to a message with a later date.
        # In this case one of the two date headers must be wrong. We could try
        # to analyze other headers (especially received), but for now we just
        # assume that it is the referrer (although in the example I'm
        # currently looking at it is the referree) and adjust that. We should
        # preserve the original date header, though. Use separate sort_date and
        # date?
        one_second = datetime.timedelta(seconds=1)

        for m in self.messages.values():
            if m.msgid in m.in_reply_to:
                m.in_reply_to = [r for r in m.in_reply_to if r != m.msgid]

        # A dict is used as an ordered set so that the dummies are created in
        # a reproducible order.
        missing = {}
        for m in self.messages.values():
            for r in m.in_reply_to:
                if r not in self.messages:
                    missing[r] = True
        for r in missing:
            # Each dummy is placed just before the currently earliest message
            # (which may be a dummy added in a previous iteration).
            firstdate = min(self.messages.values(), key=lambda x: x.date).date
            msg = email.message.EmailMessage()
            msg["Message-Id"] = f"<{r}>"
            msg["Date"] = firstdate - one_second
            msg["From"] = "unknown@invalid"
            msg["Subject"] = "(not in archive)"
            self.add_message(Message(msg))

        # Push every message after its parents. Without reference cycles each
        # pass settles at least one more generation, so len(messages) + 1
        # passes are always enough.
        for _ in range(len(self.messages) + 1):
            dates_ok = True
            for m in self.messages.values():
                for r in m.in_reply_to:
                    rr = self.messages[r]
                    if rr.date >= m.date:
                        m.date = rr.date + one_second
                        dates_ok = False
            if dates_ok:
                return

        # Still not consistent: the references form a cycle. Break it by
        # dropping every reference which points forward in time.
        for m in self.messages.values():
            m.in_reply_to = [
                r for r in m.in_reply_to if self.messages[r].date < m.date
            ]

    def as_html(self):
        """
        Return the thread overview (an SVG graph plus one table row per
        message) as jinja2.Markup.

        The result is computed once and cached.
        """
        if self._as_html:
            # This method isn't that expensive, but it isn't idempotent - so we
            # must not run the algorithm twice on the same thread. Therefore we
            # remember the result and return it on subsequent runs.
            return jinja2.Markup(self._as_html)
        self.fixup_in_reply_tos()
        y = 0
        x = 0
        nodes = []
        edges = []
        lines = []
        for m in sorted(self.messages.values(), key=lambda msg: msg.date):
            # We have already fudged the in_reply_to field to always contain
            # the latest reference(s), so we only need to consider that
            parents = [self.messages[r] for r in m.in_reply_to]
            if not parents:
                # Either the first message in the thread, or a message which
                # isn't in reply to anything but isn't the start of the thread
                # either. The latter will happen if fixup_in_reply_tos adds
                # more than one dummy message, but it might also happen if we
                # use different criteria for matching threads (e.g. Subject
                # or Thread-Index). In that case just start a new column to
                # get out of the way.
                if y != 0:
                    x += 1
                m.x = x
            elif len(parents) == 1:
                p = parents[0]
                if p.kids:
                    # The parent already has kids, so we must move to the side
                    # to avoid running an edge through an existing kid. We
                    # could use a sophisticated algorithm to find the best
                    # position here, but I think it sufficient to just start a
                    # new column. This may waste some space (there might have
                    # been a suitable position in the existing columns), but
                    # it will avoid collisions and is very simple.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent
                    m.x = p.x
                p.kids = True
            else:
                # Generic case with multiple references.
                # I think this should always work well if we start a new
                # column. There may be special cases where we can avoid it, not
                # sure.
                x += 1
                m.x = x
            m.y = y
            nodes.append((m.x, m.y))
            edges.extend((p.x, p.y, m.x, m.y) for p in parents)
            lines.append((m.date, m.mfrom, m.subject, m.encmsgid))
            y += 1

        r = 4
        fx = 16
        fy = 32

        def cx(col):
            # horizontal centre of a column
            return col * fx + fx/2

        def cy(row):
            # vertical centre of a row
            return row * fy + fy/2

        svg = [f"<svg width={(x + 1) * fx} height={y * fy}>"]
        for x1, y1, x2, y2 in edges:
            if x1 == x2:
                svg.append(
                    f"<line x1={cx(x1)} y1={cy(y1)} x2={cx(x2)} y2={cy(y2)}"
                    " stroke='black' />"
                )
            else:
                # The control point sits in the child's column, either half
                # way down (child is in the next row) or one row below the
                # parent.
                if y2 == y1 + 1:
                    yc = (y1 + y2) / 2
                else:
                    yc = y1 + 1
                svg.append(
                    f"<path d='M {cx(x1)} {cy(y1)}"
                    f" Q {cx(x2)} {cy(yc)} {cx(x2)} {cy(y2)}'"
                    " stroke='black' fill='none' />"
                )
        for nx, ny in nodes:
            svg.append(f"<circle cx={cx(nx)} cy={cy(ny)} r={r} />")
        svg.append("</svg>")

        def esc(value):
            # Header values come straight from the mails and must not be
            # able to inject markup.
            return html.escape(str(value))

        parts = ["<table class='thread'>"]
        for i, (date, mfrom, subject, encmsgid) in enumerate(lines):
            parts.append("<tr>")
            if i == 0:
                # The graph lives in a single cell spanning all rows.
                parts.append(f"<td rowspan={y}>")
                parts.extend(svg)
                parts.append("</td>")
            parts.append(
                f"<td class='date'><a href='/msg/{esc(encmsgid)}/'>"
                f"{esc(date)}</a></td>"
            )
            parts.append(f"<td class='from'>{esc(mfrom)}</td>")
            parts.append(f"<td class='subject'>{esc(subject)}</td>")
            parts.append("</tr>")
        parts.append("</table>")
        s = "".join(parts)
        self._as_html = s
        return jinja2.Markup(s)

    @property
    def subject(self):
        """Subject of the first message added to the thread."""
        return list(self.messages.values())[0].subject
class Archive:
    """
    All messages of a mailing list archive and the threads they form.

    messages     - list of Message objects in the order they were added
    msg2thread   - maps each message-id to the Thread containing it
    thread_list  - list of distinct threads, filled in by merge_threads
    """

    def __init__(self):
        self.messages = []
        self.msg2thread = {}
        self.thread_list = []

    def add_message(self, msg):
        """
        Add a message to the archive, initially in a thread of its own.

        A message whose message-id is already known is ignored.
        """
        self.self_check()
        m = Message(msg)
        if m.msgid in self.msg2thread:
            # We have already seen this message, so ignore it
            return
        t = Thread(self)
        t.add_message(m)
        self.messages.append(m)
        self.self_check()

    def merge_threads(self):
        """
        Merge threads whose messages reference each other and (re)build
        thread_list.

        Each thread gets the message-id and date of its earliest message as
        threadid and date.
        """
        self.self_check()
        finished = False
        while not finished:
            finished = True
            for msgid in list(self.msg2thread):
                thread = self.msg2thread[msgid]
                for msgid2 in list(thread.messages):
                    for r in thread.messages[msgid2].references:
                        if r in thread.messages:
                            continue
                        # references may contain non-existent messages, so
                        # be careful:
                        other = self.msg2thread.get(r)
                        if other is not None:
                            thread.merge_thread(other)
                            finished = False
        self.thread_list = []
        # Many message-ids map to the same thread; handle each thread once.
        # Tracking identity instead of testing threadid keeps this method
        # repeatable.
        seen = set()
        for thread in self.msg2thread.values():
            if id(thread) in seen:
                continue
            seen.add(id(thread))
            # min() returns the first of several equally early messages.
            earliest = min(thread.messages.values(), key=lambda m: m.date)
            thread.date = earliest.date
            thread.threadid = earliest.msgid
            self.thread_list.append(thread)

    def webify_messages(self):
        """Write the HTML page of every message."""
        self.self_check()
        # Iterate over the live list on purpose: messages appended while
        # rendering are picked up, too.
        for m in self.messages:
            m.webify()

    def webify_threads(self):
        """Write <basedir>/thread/<threadid>/index.html for every thread."""
        self.self_check()
        threadtmpl = jenv.get_template("thread.html")
        for t in self.thread_list:
            # NOTE(review): threadid is a raw message-id; a "/" in it would
            # create nested directories. The calendar template links to the
            # same raw id, so both would have to change together.
            threaddir = basedir + "/thread/" + t.threadid
            os.makedirs(threaddir, exist_ok=True)
            context = {
                "list": "LUGA",
                # thread.html shows the subject in <title> and <h1>
                "subject": decode_rfc2047(t.subject),
                "threadhtml": t.as_html(),
            }
            threadhtml = threadtmpl.render(context)
            # The templates declare charset utf-8, so don't depend on the
            # locale.
            with open(threaddir + "/index.html", "w", encoding="utf-8") as hfd:
                hfd.write(threadhtml)

    def webify_calendar(self):
        """
        Write <basedir>/cal/index.html, listing the threads by the year and
        month of their date.
        """
        caltmpl = jenv.get_template("calendar.html")
        cal = {}
        for t in self.thread_list:
            cal.setdefault(t.date.year, {}).setdefault(t.date.month, []).append(t)
        caldir = basedir + "/cal"
        os.makedirs(caldir, exist_ok=True)
        context = {
            "list": "LUGA",
            "cal": cal,
        }
        calhtml = caltmpl.render(context)
        with open(caldir + "/index.html", "w", encoding="utf-8") as hfd:
            hfd.write(calhtml)

    def self_check(self):
        """Assert the internal invariants of the archive."""
        # The messages in self.messages must be unique:
        seen = set()
        for m in self.messages:
            assert m.msgid not in seen, m.msgid
            seen.add(m.msgid)
# The archive is a module-level global because render_body adds reassembled
# partial messages to it.
arch = Archive()

# Read every mbox file given on the command line.
for f in sys.argv[1:]:
    print("F", f, file=sys.stderr)
    mb = mailbox.mbox(f)
    for m in mb:
        arch.add_message(m)

# Now I have a lot of 1 message threads
# Merge them
arch.merge_threads()

# Then dump all the messages
arch.webify_messages()

# And the threads
arch.webify_threads()

# And a calendar view
arch.webify_calendar()

# vim: tw=79 # vim: tw=79

37
templates/calendar.html Normal file
View File

@ -0,0 +1,37 @@
{#- Calendar overview. Context: `list` (list name) and `cal`, a mapping
    year -> month -> list of threads (each with `threadid` and `subject`). #}
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>
{{list}} by date
</title>
{#- NOTE(review): this page is written to <basedir>/cal/index.html and links
    to threads via ../thread/, so ../../style/ points above <basedir> -
    confirm where the stylesheet lives. #}
<link rel="stylesheet" href="../../style/debug.css">
</head>
<body>
<h1>{{list}} by date</h1>
<nav>
<ul>
{% for y in cal | dictsort %}
<li>
{{y.0}}
<ul>
{% for m in y.1 | dictsort %}
<li>
{{m.0}}
<ul>
{% for t in m.1 %}
<li>
<a href="../thread/{{t.threadid}}/">{{t.subject}}</a>
</li>
{% endfor %}
</ul>
</li>
{% endfor %}
</ul>
</li>
{% endfor %}
</ul>
</nav>
</body>
</html>

View File

@ -9,6 +9,9 @@
</head> </head>
<body> <body>
<h1>{{subject}}</h1> <h1>{{subject}}</h1>
<nav>
{{threadhtml}}
</nav>
<table> <table>
<tr> <th>Message-Id </th> <td>{{message_id}} </td> </tr> <tr> <th>Message-Id </th> <td>{{message_id}} </td> </tr>
<tr> <th>From </th> <td>{{from}} </td> </tr> <tr> <th>From </th> <td>{{from}} </td> </tr>

17
templates/thread.html Normal file
View File

@ -0,0 +1,17 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>
{{list}}: {{subject}}
</title>
<link rel="stylesheet" href="../../style/debug.css">
</head>
<body>
{#- Page for a single thread. Reads three context variables: `list` and
    `subject` (both shown in the title, `subject` also as the heading) and
    `threadhtml`, which is inserted into the <nav> element below. #}
<h1>{{subject}}</h1>
<nav>
{{threadhtml}}
</nav>
</body>
</html>