Convert mbox files to standalone html files

No thread or date structure, just one isolated file per message.
Only text/plain and some multipart formats are supported.
This commit is contained in:
Peter J. Holzer 2019-02-03 18:44:50 +01:00
parent 80352f727f
commit b238c56edb
1 changed files with 113 additions and 0 deletions

113
mbox2web Executable file
View File

@ -0,0 +1,113 @@
#!/usr/bin/python3
import jinja2
import mailbox
import os
import re
import sys
# Root directory of the generated archive; archive() writes each message
# to <basedir>/msg/<encoded message id>/index.html.
basedir = "."

# Template environment shared by all render functions. Templates are looked
# up in the "templates" directory (relative to the current working
# directory), and autoescaping is on, so values passed to a template are
# HTML-escaped unless they are already marked as safe markup.
jenv = jinja2.Environment(
    autoescape=True,
    loader=jinja2.FileSystemLoader(["templates"]),
)
def get_message_id(msg):
    """
    Extract the message id from a message.

    The id is the text between the first "<" and the next ">" in the
    Message-ID header; it is returned without the angle brackets.

    Parameters:
        msg: a message object supporting ``msg["Message-ID"]`` (for
            example an ``email.message.Message``).

    Returns:
        The message id as a string, without the enclosing brackets.

    Raises:
        ValueError: if the message has no Message-ID header, or the header
            does not contain a ``<...>`` token.
    """
    header = msg["Message-ID"]
    if header is None:
        raise ValueError("message has no Message-ID header")
    # str() so that header values which are not plain strings can still be
    # searched with a str pattern.
    header = str(header)
    match = re.search(r'<(.*?)>', header)
    if match is None:
        raise ValueError("no <...> message id in Message-ID header %r" % (header,))
    return match.group(1)
def encode_message_id(msgid):
    """
    Encode a message id so it can be used as a single directory name.

    Characters outside the allowed set (``!``, ``"``, the ASCII ranges
    ``$`` to ``.``, ``0`` to ``9`` and ``@`` to ``z``, plus ``:``, ``=``,
    ``|`` and ``~``) are replaced by their code point in lowercase hex,
    zero-padded to at least two digits and wrapped in braces, e.g. "/"
    becomes "{2f}". Braces themselves are outside the allowed set, so the
    encoding stays unambiguous.

    Parameters:
        msgid: the message id as a string (without angle brackets).

    Returns:
        The encoded message id as a string.
    """
    encmsgid = re.sub(
        r'[^!"$-.0-9:=@-z|~]',
        lambda m: "{%02x}" % ord(m.group(0)),
        msgid,
    )
    # "." and ".." consist only of allowed characters but name the current
    # and parent directory; encode their dots so a crafted message id cannot
    # make the archive write outside its per-message directory.
    if encmsgid in (".", ".."):
        encmsgid = encmsgid.replace(".", "{2e}")
    return encmsgid
def render_message(msg):
    """
    Render a complete message (headers plus body) as an HTML fragment.

    Used for messages contained in another message (see the
    multipart/digest and message/rfc822 branches of render_body). The
    "message2.html" template receives the raw Message-Id, Subject, From
    and Date header values and the already rendered body.

    Parameters:
        msg: the message object to render.

    Returns:
        The rendered HTML wrapped in jinja2.Markup, so that the
        autoescaping environment does not escape it a second time when it
        is inserted into an enclosing template.
    """
    # NOTE(review): jinja2.Markup was removed in Jinja2 3.1; on newer
    # releases Markup has to come from the markupsafe package instead.
    template = jenv.get_template("message2.html")
    body = render_body(msg)
    context = {"message_id": msg["Message-Id"]}
    for key, header in (("subject", "Subject"), ("from", "From"), ("date", "Date")):
        context[key] = msg[header]
    context["bodyhtml"] = body
    return jinja2.Markup(template.render(context))
def _decode_text_payload(msg):
    """Return the body of a non-multipart text part as a decoded str."""
    # decode=True undoes the Content-Transfer-Encoding (quoted-printable,
    # base64) and yields bytes.
    raw = msg.get_payload(decode=True)
    if raw is None:
        return ""
    charset = msg.get_content_charset() or "us-ascii"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # Unknown or bogus charset label: fall back to ASCII and replace
        # whatever cannot be decoded rather than aborting.
        return raw.decode("ascii", errors="replace")


def render_body(msg):
    """
    Render the body of a message (or message part) as an HTML fragment.

    Supported content types:
      * text/plain: the decoded text is passed as "body" to
        "body_text_plain.html".
      * multipart/mixed: every part is rendered with render_body.
      * multipart/digest and message/rfc822: every part is rendered with
        render_message.
    For the last three, the rendered parts are passed as "parts" to the
    template matching the content type.

    Parameters:
        msg: the message or message part to render.

    Returns:
        The rendered HTML wrapped in jinja2.Markup, so that the
        autoescaping environment does not escape it again when it is
        inserted into an enclosing template.

    Raises:
        RuntimeError: if the content type is not one of those listed above.
    """
    # NOTE(review): jinja2.Markup was removed in Jinja2 3.1; on newer
    # releases Markup has to come from the markupsafe package instead.
    content_type = msg.get_content_type()

    if content_type == "text/plain":
        template = jenv.get_template("body_text_plain.html")
        context = {"body": _decode_text_payload(msg)}
        return jinja2.Markup(template.render(context))

    # Container types: template name and the function used for each part.
    containers = {
        "multipart/mixed": ("body_multipart_mixed.html", render_body),
        "multipart/digest": ("body_multipart_digest.html", render_message),
        "message/rfc822": ("body_message_rfc822.html", render_message),
    }
    if content_type not in containers:
        raise RuntimeError("Content-type " + content_type + " not implemented yet")

    template_name, render_part = containers[content_type]
    partshtml = [render_part(part) for part in msg.get_payload()]
    template = jenv.get_template(template_name)
    return jinja2.Markup(template.render({"parts": partshtml}))
def archive(msg):
    """
    Write one message as a standalone HTML page.

    The page is rendered with the "message.html" template and stored as
    <basedir>/msg/<encoded message id>/index.html; the directory is
    created if necessary. The list name passed to the template is
    currently hard-coded to "LUGA".

    Parameters:
        msg: the message object to archive.

    Raises:
        ValueError: if the encoded message id is empty, "." or "..", and
            therefore cannot be used as a directory name.
        Whatever get_message_id and render_body raise is propagated.
    """
    mid = get_message_id(msg)
    encmid = encode_message_id(mid)
    # The id comes from an untrusted mail header; refuse names that would
    # resolve to the msg directory itself or to its parent.
    if encmid in ("", ".", ".."):
        raise ValueError("message id %r cannot be used as a directory name" % (mid,))
    msgdir = basedir + "/msg/" + encmid

    # Render completely before touching the file system, so that a
    # rendering error does not leave an empty directory or a truncated
    # index.html behind.
    msgtmpl = jenv.get_template("message.html")
    bodyhtml = render_body(msg)
    context = {
        "list": "LUGA",
        "message_id": mid,
        "subject": msg["Subject"],
        "from": msg["From"],
        "date": msg["Date"],
        "bodyhtml": bodyhtml,
    }
    msghtml = msgtmpl.render(context)

    os.makedirs(msgdir, exist_ok=True)
    # Explicit encoding: the output must not depend on the locale of the
    # process running the conversion.
    with open(msgdir + "/index.html", "w", encoding="utf-8") as hfd:
        hfd.write(msghtml)
def main(paths):
    """
    Archive every message of every mbox file in paths.

    For each file, "F <path>" is printed before its messages are passed
    to archive() one by one.

    Parameters:
        paths: iterable of mbox file names.

    Raises:
        mailbox.NoSuchMailboxError: if one of the files does not exist.
    """
    for path in paths:
        print("F", path)
        # create=False: a mistyped file name must be reported as an error
        # instead of silently creating an empty mbox file.
        mbox = mailbox.mbox(path, create=False)
        try:
            for message in mbox:
                archive(message)
        finally:
            mbox.close()


if __name__ == "__main__":
    main(sys.argv[1:])