Merge thread-handling from mbox2threads into mbox2web
This commit is contained in:
parent
1ada1c3817
commit
6923e6273a
392
mbox2web
392
mbox2web
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import datetime
|
||||
import email.header
|
||||
import email.parser
|
||||
import hashlib
|
||||
|
@ -7,6 +8,7 @@ import html
|
|||
import html.parser
|
||||
import mailbox
|
||||
import os
|
||||
import pprint
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
@ -243,7 +245,9 @@ def render_body(msg, extra=None):
|
|||
if not whole_msg_embedded_id:
|
||||
whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">")
|
||||
whole_msg_embedded_id = whole_msg_id
|
||||
archive(whole_msg)
|
||||
if whole_msg["Date"] is None:
|
||||
whole_msg["Date"] = msg["Date"]
|
||||
arch.add_message(whole_msg) # XXX - global
|
||||
del partial_message_cache[whole_msg_id]
|
||||
return "<p>This is part %d of %d of <a href='../%s/'>%s</a></p>" % (
|
||||
int(msg.get_param("number")),
|
||||
|
@ -415,7 +419,7 @@ def render_body(msg, extra=None):
|
|||
content_type = msg.get_content_type()
|
||||
content_disposition = msg.get_content_disposition()
|
||||
if content_disposition == "attachment":
|
||||
# XXX - not sure, if we should just store all content-types.
|
||||
# XXX - not sure if we should just store all content-types.
|
||||
# We probably should clean up html. Alternatively we could just store
|
||||
# all of them application/octet-stream, which browsers should download
|
||||
# and not try to display.
|
||||
|
@ -426,27 +430,6 @@ def render_body(msg, extra=None):
|
|||
return jinja2.Markup(bodyhtml)
|
||||
|
||||
|
||||
def archive(msg):
    """
    Write the HTML page for a single message.

    The page is rendered from the "message.html" template and stored as
    <basedir>/msg/<encoded message-id>/index.html. The message-id is also
    logged to stderr as a progress indicator.
    """
    message_id = get_message_id(msg)
    print("M", message_id, file=sys.stderr)

    target_dir = basedir + "/msg/" + encode_message_id(message_id)
    os.makedirs(target_dir, exist_ok=True)

    with open(target_dir + "/index.html", "w") as out:
        template = jenv.get_template("message.html")
        rendered_body = render_body(msg)
        page = template.render({
            "list": "LUGA",
            "message_id": message_id,
            "subject": decode_rfc2047(msg["Subject"]),
            "from": decode_rfc2047(msg["From"]),
            "date": msg["Date"],
            "bodyhtml": rendered_body,
        })
        out.write(page)
|
||||
|
||||
|
||||
class HTMLPart(html.parser.HTMLParser):
|
||||
"""
|
||||
A text/html part
|
||||
|
@ -913,15 +896,376 @@ class TextFlowedPart:
|
|||
return s
|
||||
|
||||
|
||||
class Message:
    """
    A single mail message together with the metadata needed for threading.

    Attributes set by the constructor:

    msgid, encmsgid -- the message-id and its encoded form (the encoded form
                       is used as the directory name of the message page)
    date            -- timezone-aware datetime parsed from the Date header
    in_reply_to     -- list of message-ids this message directly replies to
    references      -- list of all message-ids this message refers to
    mfrom, subject  -- raw From and Subject header values
    msg             -- the underlying message object
    kids            -- set to True by the thread layout once a reply has been
                       placed below this message

    The attribute "thread" is not set here; Thread.add_message assigns it.
    """

    def __init__(self, msg):
        """
        Extract the threading metadata from msg.

        Raises whatever email.utils.parsedate_to_datetime raises if the Date
        header is missing or unparseable.
        """
        self.msgid = get_message_id(msg)
        print("M", self.msgid, file=sys.stderr)
        self.encmsgid = encode_message_id(self.msgid)

        self.date = email.utils.parsedate_to_datetime(msg["Date"])
        if self.date.tzinfo is None:
            # If timezone is missing, assume local time
            self.date = self.date.astimezone()

        # In-Reply-To headers with more than one message-id are rare, but
        # standard-conforming, and some MUAs (e.g., mutt) create them.
        self.in_reply_to, self.references = self._resolve_links(
            self._msgids_from_header(msg["In-Reply-To"]),
            self._msgids_from_header(msg["References"]),
        )

        self.mfrom = msg["From"]
        self.subject = msg["Subject"]
        self.msg = msg
        self.kids = False

    @staticmethod
    def _msgids_from_header(value):
        """Return the list of message-ids (without <>) found in a header."""
        if not value:
            return []
        if isinstance(value, email.header.Header):
            # Header objects are not strings; re needs the string form.
            value = value.encode()
        return re.findall(r'<(.*?)>', value)

    @staticmethod
    def _resolve_links(in_reply_to_msgids, references_msgids):
        """
        Reconcile In-Reply-To and References.

        Returns (in_reply_to, references): every In-Reply-To id is also
        listed in references, and if there is no In-Reply-To at all, the last
        reference is used as the parent.
        """
        references = list(references_msgids)
        for msgid in in_reply_to_msgids:
            if msgid not in references:
                references.append(msgid)
        in_reply_to = list(in_reply_to_msgids)
        if not in_reply_to and references:
            # The thread layout only looks at in_reply_to, so a message with
            # only a References header needs a parent derived from it.
            in_reply_to = [references[-1]]
        return in_reply_to, references

    def __repr__(self):
        return (
            f"{self.msgid} "
            f"{self.date.strftime('%Y-%m-%d %H:%M:%S%z')} "
            f"[{', '.join(self.references)}]"
        )

    def webify(self):
        """
        Write the HTML page of this message, including the thread overview,
        to <basedir>/msg/<encmsgid>/index.html.
        """
        msg = self.msg
        mid = self.msgid
        print("M", mid, file=sys.stderr)
        msgdir = basedir + "/msg/" + self.encmsgid
        os.makedirs(msgdir, exist_ok=True)
        # Write utf-8 explicitly instead of depending on the locale; the
        # calendar and thread templates declare utf-8 (presumably
        # message.html does, too - its head is not visible here).
        with open(msgdir + "/index.html", "w", encoding="utf-8") as hfd:
            msgtmpl = jenv.get_template("message.html")
            bodyhtml = render_body(msg)
            context = {
                "list": "LUGA",
                "message_id": mid,
                "subject": decode_rfc2047(msg["Subject"]),
                "from": decode_rfc2047(msg["From"]),
                "date": msg["Date"],
                "bodyhtml": bodyhtml,
                "threadhtml": self.thread.as_html(),
            }
            hfd.write(msgtmpl.render(context))
|
||||
|
||||
|
||||
# For each message-id, record the thread it belongs to.
|
||||
# This should probably be an instance variable of Archive instead of global,
|
||||
# but for now it doesn't matter.
|
||||
msg2thread = {}
|
||||
|
||||
class Thread:
    """
    A set of messages connected by In-Reply-To/References.

    messages maps message-id to Message. threadid stays None until the
    archive assigns it. Adding a message also registers it in the
    msg2thread map of the owning archive and sets the "thread" attribute of
    the message.
    """

    def __init__(self, archive):
        self.archive = archive
        self.messages = {}
        self.threadid = None
        self._as_html = None

    def add_message(self, msg):
        """Add msg to this thread and record the new membership."""
        self.messages[msg.msgid] = msg
        self.archive.msg2thread[msg.msgid] = self
        msg.thread = self

    def merge_thread(self, other):
        """Move all messages of the thread other into this thread."""
        for msg in list(other.messages.values()):
            self.add_message(msg)

    def __repr__(self):
        s = self.threadid if self.threadid else str(id(self))
        if self.messages:
            s += " {" + ", ".join(self.messages) + "}"
        return s

    def fixup_in_reply_tos(self):
        """
        Make the in_reply_to links of this thread usable for layout.

        Adds a dummy message for every in_reply_to target which isn't in the
        thread and moves the date of a reply to one second after its parent
        if it doesn't already come later.
        """
        # Fix up some problems with in_reply_to:
        # Sometimes an in_reply_to refers to a message which isn't in the
        # archive. Add a dummy message if this happens.
        # Sometimes an in_reply_to refers to a message with a later date.
        # In this case one of the two date headers must be wrong. We could try
        # to analyze other headers (especially received), but for now we just
        # assume that it is the referrer (although in the example I'm
        # currently looking at it is the referree) and adjust that. We should
        # preserve the original date header, though. Use separate sort_date and
        # date?
        missing = {
            r
            for m in self.messages.values()
            for r in m.in_reply_to
            if r not in self.messages
        }
        # Sorted, so that the dummies get the same dates on every run.
        for r in sorted(missing):
            # Recomputed on every iteration: each dummy becomes the new
            # earliest message, so the dummies get distinct dates.
            firstdate = min(m.date for m in self.messages.values())
            missingdate = firstdate - datetime.timedelta(seconds=1)
            msg = email.message.EmailMessage()
            msg["Message-Id"] = f"<{r}>"
            msg["Date"] = missingdate
            msg["From"] = "unknown@invalid"
            msg["Subject"] = "(not in archive)"
            self.add_message(Message(msg))

        # Without cycles, one pass per message is enough to reach a stable
        # state. The bound keeps a reply cycle in broken headers from looping
        # forever.
        for _ in range(len(self.messages) + 1):
            dates_ok = True
            for m in self.messages.values():
                for r in m.in_reply_to:
                    rr = self.messages[r]
                    if rr is m:
                        # A message claiming to reply to itself can never be
                        # later than itself.
                        continue
                    if rr.date >= m.date:
                        m.date = rr.date + datetime.timedelta(seconds=1)
                        dates_ok = False
            if dates_ok:
                break

    def as_html(self):
        """Return the thread overview (graph plus message list) as markup."""
        # Building the overview isn't that expensive, but it isn't idempotent
        # - so we must not run the algorithm twice on the same thread.
        # Therefore we remember the result and return it on subsequent runs.
        if self._as_html is None:
            self._as_html = self._build_html()
        return jinja2.Markup(self._as_html)

    def _build_html(self):
        """Lay out the thread and return the overview table as a string."""
        self.fixup_in_reply_tos()
        y = 0
        x = 0
        nodes = []
        edges = []
        lines = []
        placed = set()
        for m in sorted(self.messages.values(), key=lambda msg: msg.date):
            # We have already fudged the in_reply_to field to always contain
            # the latest reference(s), so we only need to consider that.
            # Only parents which already have a position can be connected;
            # after fixup_in_reply_tos that is every parent unless the
            # headers contain a cycle.
            parents = [self.messages[r] for r in m.in_reply_to if r in placed]
            if not parents:
                if y != 0:
                    # Not in reply to anything, but not the start of the thread
                    # either. This will happen if fixup_in_reply_tos adds more
                    # than one dummy message, but it might also happen if we
                    # use different criteria for matching threads (e.g. Subject
                    # or Thread-Index)
                    # Just start a new column to get out of the way
                    x += 1
                m.x = x
                m.y = y
                nodes.append((m.x, m.y))
            elif len(parents) == 1:
                p = parents[0]
                if p.kids:
                    # The parent already has kids, so we must move to the side
                    # to avoid running an edge through an existing kid. We
                    # could use a sophisticated algorithm to find the best
                    # position here, but I think it sufficient to just start a
                    # new column. This may waste some space (there might have
                    # been a suitable position in the existing columns), but
                    # it will avoid collisions and is very simple.
                    x += 1
                    m.x = x
                else:
                    # Just put the new kid directly below the parent
                    m.x = p.x
                m.y = y
                nodes.append((m.x, m.y))
                edges.append((p.x, p.y, m.x, m.y))
                p.kids = True
            else:
                # Generic case with multiple references.
                # I think this should always work well if we start a new
                # column. There may be special cases where we can avoid it, not
                # sure.
                x += 1
                m.x = x
                m.y = y
                nodes.append((m.x, m.y))
                for p in parents:
                    edges.append((p.x, p.y, m.x, m.y))
            placed.add(m.msgid)
            lines.append((m.date, m.mfrom, m.subject, m.encmsgid))
            y += 1

        def esc(value):
            # Header values come straight from the mail and the result is
            # marked as safe markup, so they must be escaped here.
            return html.escape(str(value), quote=True)

        out = ["<table class='thread'>"]
        for i, (date, mfrom, subject, encmsgid) in enumerate(lines):
            out.append("<tr>")
            if i == 0:
                # The graph occupies one cell spanning all rows.
                out.append(f"<td rowspan={y}>")
                out.append(self._svg(nodes, edges, x + 1, y))
                out.append("</td>")
            out.append(
                f"<td class='date'><a href='/msg/{esc(encmsgid)}/'>{date}</a></td>"
            )
            out.append(f"<td class='from'>{esc(mfrom)}</td>")
            out.append(f"<td class='subject'>{esc(subject)}</td>")
            out.append("</tr>")
        out.append("</table>")
        return "".join(out)

    @staticmethod
    def _svg(nodes, edges, columns, rows):
        """Return the svg element drawing nodes (x, y) and edges (x1, y1, x2, y2)."""
        r = 4
        fx = 16
        fy = 32
        parts = [f"<svg width={columns * fx} height={rows * fy}>"]
        for px, py, cx, cy in edges:
            if px == cx:
                parts.append(
                    f"<line x1={px * fx + fx/2} y1={py * fy + fy/2} x2={cx * fx + fx/2} y2={cy * fy + fy/2} stroke='black' />"
                )
            else:
                if cy == py + 1:
                    # Adjacent rows: put the control point halfway between
                    # them (both operands are row numbers).
                    yc = (py + cy) / 2
                else:
                    yc = py + 1
                parts.append(
                    f"<path d='M {px * fx + fx/2} {py * fy + fy/2} Q {cx * fx + fx/2} {yc * fy + fy/2} {cx * fx + fx/2} {cy * fy + fy/2}' stroke='black' fill='none' />"
                )
        for nx, ny in nodes:
            parts.append(f"<circle cx={nx * fx + fx/2} cy={ny * fy + fy/2} r={r} />")
        parts.append("</svg>")
        return "".join(parts)

    @property
    def subject(self):
        """The subject of the message which was added to the thread first."""
        return next(iter(self.messages.values())).subject
|
||||
|
||||
|
||||
class Archive:
    """
    All messages of the archive and the threads they belong to.

    messages is the list of unique messages in the order they were added,
    msg2thread maps each message-id to its thread and thread_list (filled by
    merge_threads) contains each thread once.
    """

    def __init__(self):
        self.messages = []
        self.msg2thread = {}
        # Filled by merge_threads; present from the start so that the
        # webify_* methods don't fail if they are called first.
        self.thread_list = []

    def add_message(self, msg):
        """Add msg as a new single-message thread unless it is a duplicate."""
        self.self_check()
        m = Message(msg)
        if m.msgid in self.msg2thread:
            # We have already seen this message, so ignore it
            return
        t = Thread(self)
        t.add_message(m)
        self.messages.append(m)
        self.self_check()

    def merge_threads(self):
        """
        Merge all threads which are connected by references, then give every
        thread the date and message-id of its earliest message and collect
        the threads in thread_list.
        """
        self.self_check()
        finished = False
        while not finished:
            finished = True
            for msgid in list(self.msg2thread):
                thread = self.msg2thread[msgid]
                for msgid2 in list(thread.messages):
                    msg = thread.messages[msgid2]
                    for r in msg.references:
                        if r in thread.messages:
                            continue
                        # references may contain non-existent messages, so
                        # be careful:
                        other = self.msg2thread.get(r)
                        if other is not None:
                            thread.merge_thread(other)
                            finished = False

        self.thread_list = []
        for thread in self.msg2thread.values():
            if thread.threadid:
                # Every message of a thread maps to the same thread object,
                # so we see each thread once per message.
                continue
            # min returns the first of several equally early messages.
            earliest = min(thread.messages.values(), key=lambda m: m.date)
            thread.date = earliest.date
            thread.threadid = earliest.msgid
            self.thread_list.append(thread)

    def webify_messages(self):
        """Write the HTML page of every message."""
        self.self_check()
        for m in self.messages:
            m.webify()

    def webify_threads(self):
        """Write <basedir>/thread/<threadid>/index.html for every thread."""
        self.self_check()
        threadtmpl = jenv.get_template("thread.html")
        for t in self.thread_list:
            # NOTE(review): threadid is the raw message-id, not the encoded
            # form used for the message directories. Presumably it should be
            # encoded here and in the calendar links, too - confirm.
            threaddir = basedir + "/thread/" + t.threadid
            os.makedirs(threaddir, exist_ok=True)
            # The template declares utf-8, so don't depend on the locale.
            with open(threaddir + "/index.html", "w", encoding="utf-8") as hfd:
                context = {
                    "list": "LUGA",
                    # thread.html uses the subject in its title and heading
                    "subject": t.subject,
                    "threadhtml": t.as_html(),
                }
                hfd.write(threadtmpl.render(context))

    def webify_calendar(self):
        """Write <basedir>/cal/index.html, listing the threads by year and month."""
        caltmpl = jenv.get_template("calendar.html")
        cal = {}
        for t in self.thread_list:
            cal.setdefault(t.date.year, {}).setdefault(t.date.month, []).append(t)
        caldir = basedir + "/cal"
        os.makedirs(caldir, exist_ok=True)
        # The template declares utf-8, so don't depend on the locale.
        with open(caldir + "/index.html", "w", encoding="utf-8") as hfd:
            context = {
                "list": "LUGA",
                "cal": cal,
            }
            hfd.write(caltmpl.render(context))

    def self_check(self):
        """Assert that the messages in self.messages are unique."""
        seen = set()
        for m in self.messages:
            assert m.msgid not in seen, m.msgid
            seen.add(m.msgid)
|
||||
|
||||
|
||||
arch = Archive()

# Read every mbox file named on the command line
for mbox_path in sys.argv[1:]:
    print("F", mbox_path, file=sys.stderr)
    for raw_msg in mailbox.mbox(mbox_path):
        archive(raw_msg)
        arch.add_message(raw_msg)

# Now I have a lot of 1 message threads:
#   merge them,
#   then dump all the messages,
#   and the threads,
#   and a calendar view
for phase in (
    arch.merge_threads,
    arch.webify_messages,
    arch.webify_threads,
    arch.webify_calendar,
):
    phase()
|
||||
|
||||
# vim: tw=79
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>
|
||||
{{list}} by date
|
||||
</title>
|
||||
<link rel="stylesheet" href="../../style/debug.css">
|
||||
</head>
|
||||
<body>
|
||||
<h1>{{list}} by date</h1>
|
||||
<nav>
  <ul>
    {% for y in cal | dictsort %}
    <li>
      {{y.0}}
      <ul>
        {% for m in y.1 | dictsort %}
        <li>
          {{m.0}}
          <ul>
            {% for t in m.1 %}
            <li>
              <a href="../thread/{{t.threadid}}/">{{t.subject}}</a>
            </li>
            {% endfor %}
          </ul>
        </li>
        {% endfor %}
      </ul>
    </li>
    {% endfor %}
  </ul>
</nav>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
|
@ -9,6 +9,9 @@
|
|||
</head>
|
||||
<body>
|
||||
<h1>{{subject}}</h1>
|
||||
<nav>
|
||||
{{threadhtml}}
|
||||
</nav>
|
||||
<table>
|
||||
<tr> <th>Message-Id </th> <td>{{message_id}} </td> </tr>
|
||||
<tr> <th>From </th> <td>{{from}} </td> </tr>
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>
|
||||
{{list}}: {{subject}}
|
||||
</title>
|
||||
<link rel="stylesheet" href="../../style/debug.css">
|
||||
</head>
|
||||
<body>
|
||||
<h1>{{subject}}</h1>
|
||||
<nav>
|
||||
{{threadhtml}}
|
||||
</nav>
|
||||
</body>
|
||||
</html>
|
||||
|
Loading…
Reference in New Issue