2019-02-03 18:44:50 +01:00
2019-04-30 21:55:21 +02:00
import email.header
2019-02-28 17:17:44 +01:00
import email.parser
2019-03-01 11:58:22 +01:00
import hashlib
2019-02-28 09:30:47 +01:00
import html
import html.parser
2019-02-03 18:44:50 +01:00
import mailbox
import os
import re
2019-03-01 13:54:13 +01:00
import subprocess
2019-02-03 18:44:50 +01:00
import sys
2019-03-01 13:54:13 +01:00
import tempfile
2019-02-28 09:30:47 +01:00
import urllib.parse
import jinja2
2019-02-03 18:44:50 +01:00
basedir = "."
jenv = jinja2.Environment(
def get_message_id(msg):
    """
    Extract the message id from a message.

    The id is the text between the first pair of angle brackets in the
    ``Message-ID`` header; the brackets themselves are not returned.

    Note that this assumes that there is (at least) one message id. If
    this is not the case, it will raise an exception: currently a
    TypeError if the header is missing altogether and an AttributeError
    if it contains no ``<...>`` token, but we may use something more
    suitable in the future.
    """
    match = re.search(r'<(.*?)>', msg["Message-ID"])
    return match.group(1)
2019-04-30 21:55:21 +02:00
2019-02-03 18:44:50 +01:00
def encode_message_id(msgid):
    """
    Escape a message id so that it only contains a conservative character set.

    Every character outside the allowed set (``!``, ``"``, ``$``, ``(`` to
    ``.``, digits, ``:``, ``=``, ``@`` to ``z``, ``|`` and ``~``) is
    replaced by its code point in lower-case hex, wrapped in braces, e.g.
    ``/`` becomes ``{2f}``. Code points above 0xff simply produce more than
    two hex digits.
    """
    return re.sub(
        '[^!"$(-.0-9:=@-z|~]',
        lambda m: "{%02x}" % ord(m.group(0)),
        msgid,
    )
2019-04-30 21:55:21 +02:00
def decode_rfc2047(s):
    """
    Decode a header value containing RFC 2047 encoded words to a str.

    Returns None if s is None (i.e. the header was absent).

    Chunks with a declared charset are decoded with that charset. If the
    charset is unknown to Python or the bytes are invalid for it, we fall
    back to windows-1252. Chunks without a charset are either already str
    or bytes which are decoded as us-ascii.

    Both fallback decodes use errors="replace": windows-1252 leaves a few
    byte values (e.g. 0x81) undefined and unencoded header text may
    contain raw 8-bit bytes, so a strict decode could still raise and
    abort processing of the whole message.
    """
    if s is None:
        return None
    decoded = []
    for data, charset in email.header.decode_header(s):
        if isinstance(data, str):
            # Header without any encoded words: decode_header hands the
            # text back unchanged.
            decoded.append(data)
        elif charset:
            try:
                decoded.append(data.decode(charset))
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrongly declared charset? Probably win-1252.
                decoded.append(data.decode("windows-1252", errors="replace"))
        else:
            decoded.append(data.decode("us-ascii", errors="replace"))
    return "".join(decoded)
2019-02-03 18:44:50 +01:00
def render_message(msg):
    """
    Render a complete (embedded) message, headers and body, to HTML.

    The body is rendered with render_body() and then placed, together
    with the decoded Subject and From headers and the raw Message-Id and
    Date headers, into the "message2.html" template.

    Returns the rendered HTML wrapped in jinja2.Markup so that it is not
    escaped again when it is inserted into another template.
    """
    msgtmpl = jenv.get_template("message2.html")
    bodyhtml = render_body(msg)
    context = {
        "msg": msg,
        "message_id": msg["Message-Id"],
        "subject": decode_rfc2047(msg["Subject"]),
        "from": decode_rfc2047(msg["From"]),
        "date": msg["Date"],
        "bodyhtml": bodyhtml,
    }
    msghtml = msgtmpl.render(context)
    return jinja2.Markup(msghtml)
2019-03-02 12:10:01 +01:00
2019-03-10 23:27:30 +01:00
def save_part(msg, disposition):
    """
    Store the decoded payload of a message part in the "parts" directory.

    The file name is the SHA-256 hex digest of the payload plus an
    extension derived from the content type, so identical attachments are
    stored only once.

    msg         -- the message part to store. Its payload must be a leaf
                   (get_payload(decode=True) must return bytes); nested
                   parts are not supported.
    disposition -- "_url" to get just the relative URL of the stored
                   file, otherwise a prefix (e.g. "attachment", "inline")
                   which selects the template
                   "<disposition>_<type>_<subtype>.html".

    Returns the URL (for "_url") or the rendered HTML snippet referring
    to the stored file.

    Raises KeyError for a content type without a known extension.
    NOTE(review): the lookup expression was lost from the source this was
    reconstructed from; a plain subscript is assumed -- confirm.
    """
    content_type = msg.get_content_type()
    extension = {
        "application/octet-stream": ".bin",
        "text/html": ".html",
        "text/x-vcard": ".vcf",
        "text/plain": ".txt",
        "application/x-gzip": ".gz", # more likely tar.gz, but we can't know without looking into it which we ain't
        "image/gif": ".gif",
        "text/x-c": ".c",
        "application/x-perl": ".pl",
        "application/msword": ".doc",
        "application/ms-tnef": ".ms-tnef",
        "application/x-bzip2": ".bz2", # more likely tar.bz2, but we can't know without looking into it which we ain't
        "application/x-shellscript": ".sh",
        "application/x-java-vm": ".bin", # The only instances are mis-labelled
        "image/png": ".png",
        "application/pgp-keys": ".pgp",
        "application/x-gunzip": ".gz", # that sort of makes sense, but not really
        "image/jpeg": ".jpg",
        "text/x-python": ".py",
        "text/x-java": ".java",
        "application/x-sh": ".sh",
        "text/x-patch": ".patch",
        "text/x-c++src": ".c++",
        "application/x-compressed-tar": ".tar.gz",
        "application/vnd.oasis.opendocument.text": ".odt",
        "text/x-perl": ".pl",
        "application/pgp-signature": ".pgp",
        "image/svg+xml": ".svg",
    }[content_type]

    name = msg.get_param("name") or "(data)"
    payload = msg.get_payload(decode=True)
    m = hashlib.sha256()
    m.update(payload)
    filename = m.hexdigest() + extension
    os.makedirs("parts", exist_ok=True)
    with open("parts/" + filename, "wb") as fh:
        fh.write(payload)

    # Relative to the per-message directory (<basedir>/msg/<id>/).
    url = "../../parts/" + filename
    if disposition == "_url":
        return url

    template_name = disposition + "_" + content_type.replace("/", "_") + ".html"
    bodytmpl = jenv.get_template(template_name)
    context = {
        "name": name,
        "url": url,
    }
    bodyhtml = bodytmpl.render(context)
    return bodyhtml
2019-03-02 12:10:01 +01:00
2019-02-28 17:17:44 +01:00
partial_message_cache = {}
2019-03-31 23:48:57 +02:00
def render_body(msg, extra=None):
def render_text_plain(msg, extra=None):
2019-05-20 00:32:33 +02:00
# msg.get_charset() doesn't work
ct_params = dict(msg.get_params() or [])
charset = ct_params.get("charset", "iso-8859-1")
format = ct_params.get("format", "fixed")
if format == "fixed":
bodytmpl = jenv.get_template("body_text_plain.html")
2019-05-20 23:06:08 +02:00
partbytes = msg.get_payload(decode=True)
parttext = partbytes.decode(charset, errors="replace")
except LookupError as e:
# Unknown encoding? Probably win-1252
print(e, file=sys.stderr)
parttext = partbytes.decode("windows-1252", errors="replace")
2019-05-20 00:32:33 +02:00
context = {
2019-05-20 23:06:08 +02:00
"body": parttext
2019-05-20 00:32:33 +02:00
return bodytmpl.render(context)
elif format == "flowed":
bodytmpl = jenv.get_template("body_text_plain_flowed.html")
parthtml = TextFlowedPart(msg).as_string()
context = {
"body": jinja2.Markup(parthtml),
return bodytmpl.render(context)
raise NotImplementedError()
2019-03-31 23:48:57 +02:00
def render_multipart_mixed(msg, extra=None):
2019-03-17 22:17:02 +01:00
parts = msg.get_payload()
if type(parts) == str:
# mislabelled, assume text/plain
2019-03-17 22:24:17 +01:00
return render_text_plain(msg)
2019-05-12 22:06:51 +02:00
# First, scan for parts with a content-id. A multipart/mixed shouldn't
# have them, but I've seen them in the wild and it should be harmless
# to support at least images. We don't want all content types, though,
# because save_part doesn't support nested parts and I don't want to
# fully implement what is really just a workaround for buggy software.
for i, part in enumerate(msg.get_payload()):
content_id = part.get("Content-Id")
content_type = part.get_content_type()
if content_id and content_type.startswith("image/"):
if extra is None:
extra = {}
extra[content_id] = {
"i": i,
"part": part,
"url": save_part(part, "_url"),
2019-02-03 18:44:50 +01:00
partshtml = []
for part in msg.get_payload():
2019-05-12 22:06:51 +02:00
partshtml.append(render_body(part, extra))
2019-02-03 18:44:50 +01:00
bodytmpl = jenv.get_template("body_multipart_mixed.html")
context = {
"parts": partshtml
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-31 23:48:57 +02:00
def render_multipart_digest(msg, extra=None):
2019-02-03 18:44:50 +01:00
partshtml = []
for part in msg.get_payload():
bodytmpl = jenv.get_template("body_multipart_digest.html")
context = {
"parts": partshtml
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-31 23:48:57 +02:00
def render_message_rfc822(msg, extra=None):
2019-02-03 18:44:50 +01:00
partshtml = []
for part in msg.get_payload():
bodytmpl = jenv.get_template("body_message_rfc822.html")
context = {
"parts": partshtml
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-31 23:48:57 +02:00
def render_text_html(msg, extra=None):
htmlpart = HTMLPart(extra)
2019-05-20 23:25:49 +02:00
ct_params = dict(msg.get_params())
charset = ct_params.get("charset", "iso-8859-1")
raw_text = msg.get_payload(decode=True).decode(charset, errors="replace")
2019-03-01 11:13:09 +01:00
bodytmpl = jenv.get_template("body_text_html.html")
context = {
"body": jinja2.Markup(htmlpart.as_string())
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-31 23:48:57 +02:00
def render_text_enriched(msg, extra=None):
2019-03-16 21:53:06 +01:00
payload = msg.get_payload(decode=True).decode(msg.get_charset() or "iso-8859-1")
tepart = TextEnrichedPart(payload)
2019-03-01 11:13:09 +01:00
bodytmpl = jenv.get_template("body_text_enriched.html")
context = {
"body": jinja2.Markup(tepart.as_string())
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-31 23:48:57 +02:00
def render_message_partial(msg, extra=None):
2019-02-28 17:17:44 +01:00
# Default header for get_param is Content-Type
whole_msg_id = msg.get_param("id")
if not whole_msg_id in partial_message_cache:
# For now we assume that total is present on all parts. This
# isn't guaranteed, however, and we may need to handle the
# case where total is only present on the last part.
partial_message_cache[whole_msg_id] = [None] * int(msg.get_param("total"))
payload = msg.get_payload()
s = payload[0].as_string() # Only one part
partial_message_cache[whole_msg_id][int(msg.get_param("number"))-1] = s
if not None in partial_message_cache[whole_msg_id]:
p = email.parser.Parser()
whole_msg = p.parsestr("".join(partial_message_cache[whole_msg_id]))
whole_msg_embedded_id = whole_msg["Message-Id"]
if not whole_msg_embedded_id:
whole_msg.add_header("Message-Id", "<" + whole_msg_id + ">")
whole_msg_embedded_id = whole_msg_id
del partial_message_cache[whole_msg_id]
2019-03-17 22:30:27 +01:00
return "<p>This is part %d of %d of <a href='../%s/'>%s</a></p>" % (
2019-02-28 17:17:44 +01:00
2019-03-31 23:48:57 +02:00
def render_application_octet_stream(msg, extra=None):
2019-03-17 22:30:27 +01:00
return save_part(msg, "attachment")
2019-03-01 11:58:22 +01:00
2019-03-31 23:48:57 +02:00
def render_multipart_signed(msg, extra=None):
2019-03-01 13:54:13 +01:00
content, signature = msg.get_payload()
with tempfile.NamedTemporaryFile(buffering=0) as content_fh:
with tempfile.NamedTemporaryFile(buffering=0, suffix=".asc") as signature_fh:
r = subprocess.run(["gpg", "--verify", signature_fh.name, content_fh.name],
gpgresult = r.stderr
2019-03-02 12:24:56 +01:00
# XXX - Analyze gpgresult or just use r.returncode?
2019-03-01 13:54:13 +01:00
gpgstatus = "dubious"
contenthtml = render_message(content)
bodytmpl = jenv.get_template("body_multipart_signed.html")
context = {
"content": contenthtml,
"gpgresult": gpgresult,
"gpgstatus": gpgstatus,
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-01 11:58:22 +01:00
2019-03-31 23:48:57 +02:00
def render_application_pgp(msg, extra=None):
2019-03-02 12:24:56 +01:00
with tempfile.NamedTemporaryFile(buffering=0) as content_fh:
r = subprocess.run(["gpg", "--decrypt", content_fh.name],
gpgresult = r.stderr.decode()
# XXX - Analyze gpgresult or just use r.returncode?
gpgstatus = "dubious"
decrypted_content = r.stdout
p = email.parser.BytesParser()
embedded_message = p.parsebytes(decrypted_content)
contenthtml = render_message(embedded_message)
bodytmpl = jenv.get_template("body_application_pgp.html")
context = {
"content": contenthtml,
"gpgresult": gpgresult,
"gpgstatus": gpgstatus,
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-02 12:24:56 +01:00
2019-03-31 23:48:57 +02:00
def render_multipart_alternative(msg, extra=None):
2019-03-02 23:33:39 +01:00
partshtml = []
partstypes = []
for part in msg.get_payload():
Pass extra parameter to children of multipart/alternative
A structure like this is quite common:
─><no description> [multipa/related, 7bit, 12K]
├─><no description> [multipa/alternativ, 7bit, 9.0K]
│ ├─><no description> [text/plain, quoted, iso-8859-1, 3.7K]
│ └─><no description> [text/html, quoted, iso-8859-1, 4.9K]
└─>2b0063.jpg [image/jpeg, base64, 3.3K]
Here the main content of multipart/related isn't the html part, but a
multipart/alternative containing the html part and a text part. The html
part still needs access to the other content of the multipart/related
part, so we need to pass this through.
2019-05-12 21:37:26 +02:00
partshtml.append(render_body(part, extra))
2019-03-02 23:33:39 +01:00
bodytmpl = jenv.get_template("body_multipart_alternative.html")
context = {
"types": partstypes,
"parts": partshtml,
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-02 23:33:39 +01:00
2019-03-31 23:48:57 +02:00
def render_application_x_unknown_content_type_scpfile(msg, extra=None):
2019-03-02 23:33:39 +01:00
bodytmpl = jenv.get_template("body_application_x-unknown-content-type-scpfile.html")
context = {
"body": msg.get_payload(decode=True).decode(msg.get_charset() or "iso-8859-1")
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-02 23:33:39 +01:00
2019-03-31 23:48:57 +02:00
def render_application_pgp_signature(msg, extra=None):
2019-03-04 21:23:03 +01:00
# A PGP signature outside of a multipart/signed - useless
bodytmpl = jenv.get_template("body_application_pgp-signature.html")
context = {
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-04 21:23:03 +01:00
2019-03-31 23:48:57 +02:00
def render_application_x_gzip(msg, extra=None):
2019-03-17 22:30:27 +01:00
return save_part(msg, "attachment")
2019-03-04 21:49:46 +01:00
2019-03-31 23:48:57 +02:00
def render_message_news(msg, extra=None):
2019-03-10 22:47:10 +01:00
partshtml = []
for part in msg.get_payload():
bodytmpl = jenv.get_template("body_message_news.html")
context = {
"msg": msg,
"parts": partshtml,
2019-03-17 22:30:27 +01:00
return bodytmpl.render(context)
2019-03-10 23:27:30 +01:00
2019-03-31 23:48:57 +02:00
def render_image_gif(msg, extra=None):
2019-03-17 22:30:27 +01:00
return save_part(msg, "inline")
2019-03-10 23:27:30 +01:00
2019-03-31 23:48:57 +02:00
def render_multipart_related(msg, extra=None):
start = msg.get_param("start")
start_part = None
# collect content-ids
content = {}
for i, part in enumerate(msg.get_payload()):
content_id = part.get("Content-Id")
if start_part is None and (start is None or content_id == start):
start_part = part
if content_id:
content[content_id] = {
"i": i,
"part": part,
"url": save_part(part, "_url"),
parthtml = render_body(start_part, content)
bodytmpl = jenv.get_template("body_multipart_related.html")
context = {
"msg": msg,
"parts": [parthtml],
return bodytmpl.render(context)
2019-05-12 22:12:12 +02:00
def render_image_jpeg(msg, extra=None):
return save_part(msg, "inline")
2019-05-20 23:59:26 +02:00
def render_message_delivery_status(msg, extra=None):
bodytmpl = jenv.get_template("body_message_delivery_status.html")
# A message/delivery status consists of one per-message block
# followed by one or more per-recipient blocks.
# Python's message parser apparently parses each block as a message
# consisting only of headers. So we just stringify and concatenate them
parts = msg.get_payload()
parttext = "".join([str(p) for p in parts])
context = {
"body": parttext
return bodytmpl.render(context)
2019-03-17 22:17:02 +01:00
renderers = {
"text/plain": render_text_plain,
"multipart/mixed": render_multipart_mixed,
"multipart/digest": render_multipart_digest,
"message/rfc822": render_message_rfc822,
"text/html": render_text_html,
"text/enriched": render_text_enriched,
"message/partial": render_message_partial,
"application/octet-stream": render_application_octet_stream,
"multipart/signed": render_multipart_signed,
"application/pgp": render_application_pgp,
"multipart/alternative": render_multipart_alternative,
"application/x-unknown-content-type-scpfile": render_application_x_unknown_content_type_scpfile,
"application/pgp-signature": render_application_pgp_signature,
"application/x-gzip": render_application_x_gzip,
"message/news": render_message_news,
"image/gif": render_image_gif,
2019-03-31 23:48:57 +02:00
"multipart/related": render_multipart_related,
2019-04-30 22:15:20 +02:00
"application/x-java-vm": render_application_octet_stream,
2019-05-12 22:12:12 +02:00
"image/jpeg": render_image_jpeg,
2019-05-12 22:58:41 +02:00
"application/x-compressed-tar": render_application_octet_stream,
2019-05-20 23:59:26 +02:00
"message/delivery-status": render_message_delivery_status,
2019-06-17 20:59:23 +02:00
"application/pgp-keys": render_application_octet_stream,
2019-03-17 22:17:02 +01:00
content_type = msg.get_content_type()
content_disposition = msg.get_content_disposition()
if content_disposition == "attachment":
# XXX - not sure, if we should just store all content-types.
# We probably should clean up html. Alternatively we could just store
# all of them application/octet-stream, which browsers should download
# and not try to display.
bodyhtml = save_part(msg, content_disposition)
2019-02-03 18:44:50 +01:00
2019-03-31 23:48:57 +02:00
bodyhtml = renderers[content_type](msg, extra)
2019-02-03 18:44:50 +01:00
2019-02-28 09:30:47 +01:00
return jinja2.Markup(bodyhtml)
2019-02-03 18:44:50 +01:00
def archive(msg):
    """
    Render a message and store it as <basedir>/msg/<encoded id>/index.html.

    The message id is taken from the Message-ID header (see
    get_message_id for what happens if there is none) and encoded with
    encode_message_id to form the directory name. Progress is reported on
    stderr.

    The page is rendered completely before the output file is opened, so
    a message which cannot be rendered doesn't leave an empty index.html
    behind.
    """
    mid = get_message_id(msg)
    print("M", mid, file=sys.stderr)
    encmid = encode_message_id(mid)
    msgdir = basedir + "/msg/" + encmid

    msgtmpl = jenv.get_template("message.html")
    bodyhtml = render_body(msg)
    context = {
        "list": "LUGA",
        "message_id": mid,
        "subject": decode_rfc2047(msg["Subject"]),
        "from": decode_rfc2047(msg["From"]),
        "date": msg["Date"],
        "bodyhtml": bodyhtml,
    }
    msghtml = msgtmpl.render(context)

    os.makedirs(msgdir, exist_ok=True)
    with open(msgdir + "/index.html", "w") as hfd:
        hfd.write(msghtml)
2019-02-28 09:30:47 +01:00
class HTMLPart(html.parser.HTMLParser):
2019-03-01 10:21:57 +01:00
allowed_tags = [
"h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
"th", "td", "b", "select", "option", "input", "sup", "address",
2019-03-04 21:07:18 +01:00
"center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
2019-03-10 23:27:30 +01:00
2019-03-01 10:21:57 +01:00
hide_tags = [ "title" ]
ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
2019-02-28 09:30:47 +01:00
2019-03-31 23:48:57 +02:00
def __init__(self, extra):
2019-02-28 09:30:47 +01:00
self.hide = False
self.content = []
2019-03-01 22:52:41 +01:00
self.base = None
2019-03-31 23:48:57 +02:00
self.extra = extra or {}
2019-02-28 09:30:47 +01:00
def handle_starttag(self, tag, attrs):
if tag == "base":
href = [x[1] for x in attrs if x[0] == "href"]
if href:
self.base = href[0]
elif tag in self.allowed_tags:
attrstr = "".join(
2019-03-01 10:21:57 +01:00
[' %s="%s"' % (a[0], html.escape(a[1])) if a[1] else ' %s' % (a[0])
2019-02-28 09:30:47 +01:00
for a in self.clean_attrs(tag, attrs)
self.content.append("<%s%s>" % ( tag, attrstr ))
elif tag in self.hide_tags:
self.hide = True
elif tag in self.ignore_tags:
print("Encountered unknown start tag", tag, attrs, file=sys.stderr)
def handle_endtag(self, tag):
if tag in self.allowed_tags:
self.content.append("</%s>" % tag)
elif tag in self.hide_tags:
self.hide = False # XXX - Need stack?
elif tag in self.ignore_tags:
print("Encountered unknown end tag", tag, file=sys.stderr)
def handle_data(self, data):
if not self.hide:
def as_string(self):
return "".join(self.content)
def clean_attrs(self, tag, attrs):
2019-03-01 10:21:57 +01:00
safe_attrs = [
"border", "alt", "size", "face", "width", "height", "hspace",
"cellpadding", "cellspacing", "bgcolor", "valign", "nowrap",
"color", "colspan", "name", "value", "type", "align", "clear",
2019-03-04 21:07:18 +01:00
"noshade", "type",
2019-03-01 10:21:57 +01:00
2019-02-28 09:30:47 +01:00
clean_attrs = []
for a in attrs:
2019-03-01 10:21:57 +01:00
if a[0] in safe_attrs:
elif a[0] == "href":
2019-02-28 09:30:47 +01:00
url = a[1]
url = urllib.parse.urljoin(self.base, url)
u = urllib.parse.urlparse(url)
if u[0] in ['https', 'http', 'ftp']:
clean_attrs.append((a[0], url))
elif a[0] == "src":
url = a[1]
url = urllib.parse.urljoin(self.base, url)
u = urllib.parse.urlparse(url)
if u[0] == "cid":
print("Encountered src cid attribute", a, file=sys.stderr)
2019-03-31 23:48:57 +02:00
clean_attrs.append((a[0], self.extra["<" + u.path + ">"]["url"]))
2019-02-28 09:30:47 +01:00
print("Ignored src attribute", a, file=sys.stderr)
2019-03-01 10:21:57 +01:00
elif a[0] == "target":
2019-02-28 09:30:47 +01:00
print("Encountered unknown attribute", a, file=sys.stderr)
return clean_attrs
class TextEnrichedPart:
class TEElement:
2019-03-16 21:53:06 +01:00
def __init__(self, t, parent):
2019-02-28 09:30:47 +01:00
self.type = t.lower()
self.content = []
2019-03-16 21:53:06 +01:00
if self.type == "nofill":
self.filled = False
elif parent:
self.filled = parent.filled
self.filled = True
2019-02-28 09:30:47 +01:00
def append_text(self, s):
s = s.replace("<<", "<")
if self.filled:
s = re.sub(r'\n+',
lambda m: m.group(0)[1:] if len(m.group(0)) > 1 else " ",
def as_string(self):
if self.type == "":
pre = "<div class='text-enriched'>"
post = "</div>"
elif self.type == "bold":
pre = "<b>"
post = "</b>"
2019-03-16 21:53:06 +01:00
elif self.type == "param":
# We shouldn't ever get here since the param should be consumed
# by the parent, but there are broken messages ...
return ""
elif self.type.startswith("x-"):
# Just ignore all experimental elements and render their
# contents.
pre = ""
post = ""
elif self.type == "flushleft":
pre = "<div class='flushleft'>"
post = "</div>"
elif self.type == "smaller":
# HTML has a "small" element, but that is meant for "side
# comments such as small print", while t/e "smaller" is purely
# typographical
pre = "<span style='font-size: 0.9em'>"
post = "</span>"
elif self.type == "color":
param = self.content.pop(0)
if param.type != "param":
raise RuntimeError("Expected 'param', got '%s'" % param.type)
colorstring = param.content[0]
if re.match(r'^\w+$', colorstring):
# a single word, i.e. a colorname like "red" or "cyan".
# The 8 colors in the spec aren't a subset of the 17 colors in CSS2,
# but recognized by most/all browsers. And if we encounter a non-standard
# color the best we can do is let the browser handle it.
m = re.match(r'([0-9a-f]{4}),([0-9a-f]{4}),([0-9a-f]{4})', colorstring, re.IGNORECASE)
if m:
# an RGB triple. Use only the top 8 bits of each component:
colorstring = "#%s%s%s" % (m.group(1)[:2], m.group(2)[:2], m.group(3)[:2])
# syntax error. Replace with "black"
colorstring = "#000"
pre = "<span style='color: %s'>" % colorstring
post = "</span>"
elif self.type == "nofill":
pre = "<div class='nofill'>"
post = "</div>"
elif self.type == "fontfamily":
param = self.content.pop(0)
if param.type != "param":
raise RuntimeError("Expected 'param', got '%s'" % param.type)
fontfamily = param.content[0]
if "'" in fontfamily or '"' in fontfamily:
raise RuntimeError("Can't handle quotes in font names (%s)" % fontfamily)
pre = "<span style='font-family: \"%s\"'>" % fontfamily
post = "</span>"
elif self.type == "bigger":
# HTML used to have a "big" element, but that has been removed from HTML5
pre = "<span style='font-size: 1.1em'>"
post = "</span>"
2019-03-17 22:42:44 +01:00
elif self.type == "underline":
# HTML5 redefined the meaning of "u", but I'm using it anyway
pre = "<u>"
post = "</u>"
2019-02-28 09:30:47 +01:00
raise NotImplementedError("Unknown type " + self.type)
s = pre
for c in self.content:
if isinstance(c, type(self)):
s += c.as_string()
s += html.escape(c)
s += post
return s
def __init__(self, s):
2019-03-16 21:53:06 +01:00
self.stack = [ self.TEElement("", None) ]
2019-02-28 09:30:47 +01:00
while s:
stack_top = self.stack[-1]
m = re.match(r'(.*?)<(/?[A-Za-z0-9-]{,60})>(.*)', s, re.DOTALL)
if m:
2019-03-16 21:53:06 +01:00
text = m.group(1)
tag = m.group(2).lower()
if not (tag == "param" and re.match(r'\s*', text) or text == ""):
if tag[0] != "/":
new = self.TEElement(tag, stack_top)
2019-02-28 09:30:47 +01:00
2019-03-16 21:53:06 +01:00
closed_tag = tag[1:]
if stack_top.type == closed_tag:
elif closed_tag in [e.type for e in self.stack]:
# We close a tag which has been opened, but it
# wasn't the last one. This is clearly a nesting
# error, but there was broken software (e.g.
# http://www.fozztexx.com/Mynah/) which used
# non-closing tags, and by just popping them off
# the stack we can "re-synchronize".
while self.stack.pop().type != closed_tag:
2019-02-28 09:30:47 +01:00
2019-03-16 21:53:06 +01:00
raise RuntimeError("Nesting error: Expected %s, got %s near %s" % (self.stack[-1].type, closed_tag, s))
2019-02-28 09:30:47 +01:00
s = m.group(3)
s = ""
def as_string(self):
return self.stack[0].as_string()
2019-05-20 00:32:33 +02:00
class TextFlowedPart:
    """
    Convert a text/plain; format=flowed body into HTML paragraphs.

    The constructor decodes the payload and joins soft-broken lines (lines
    ending in a space) into paragraphs, keeping track of the quote depth
    (number of leading ">" characters). The result is collected in
    self.lines, a list of dicts with the keys "quote_depth", "flowed" and
    "content"; as_string() turns that list into HTML.
    """

    def __init__(self, msg):
        """
        Parse the payload of msg.

        msg -- an email message part. The charset (default iso-8859-1)
               and delsp (default "no") parameters of its Content-Type
               are honoured.
        """
        # State of the paragraph currently being assembled
        self.quote_depth = 0
        self.current_line = ""
        self.flowed = False
        self.buffer_filled = False
        # Finished paragraphs
        self.lines = []

        # get_params() returns None if there is no Content-Type header
        ct_params = dict(msg.get_params() or [])
        charset = ct_params.get("charset", "iso-8859-1")
        delsp = ct_params.get("delsp", "no") == "yes"
        # Charset names seen in the wild which Python doesn't know
        charset_map = {
            "x-mac-roman": "mac_roman",
        }
        if charset in charset_map:
            charset = charset_map[charset]
        raw_text = msg.get_payload(decode=True).decode(charset, errors="replace")

        # Normalize CRLF first: with a trailing "\r" the test for a
        # trailing space (i.e. a soft line break) would never succeed.
        raw_lines = raw_text.replace("\r\n", "\n").split("\n")
        for rl in raw_lines:
            quote_depth = 0
            while rl[:1] == ">":
                quote_depth += 1
                rl = rl[1:]
            if rl[:1] == " ":
                # space-stuffing
                rl = rl[1:]
            if rl == "-- ":
                # The signature separator is never flowed and always
                # starts a new paragraph.
                flowed = None
            elif rl[-1:] == " ":
                flowed = True
                if delsp:
                    rl = rl[:-1]
            else:
                flowed = False
            self.add_buffer(rl, quote_depth, flowed)
        # The text may end with a soft-broken line
        self.flush()

    def add_buffer(self, line, quote_depth, flowed):
        """
        Append one (unquoted, unstuffed) line to the current paragraph.

        flowed is True if the line ends with a soft break, False if it
        ends the paragraph, and None if it has to stand on its own. A
        change of quote depth also ends the current paragraph.
        """
        if flowed is None:
            self.flush()
            flowed = False
        if quote_depth != self.quote_depth:
            self.flush()
            self.quote_depth = quote_depth
        self.current_line += line
        self.flowed |= flowed
        self.buffer_filled = True
        if not flowed:
            self.flush()

    def flush(self):
        """Move the current paragraph (if any) to self.lines and reset the buffer."""
        if self.buffer_filled:
            self.lines.append({
                "quote_depth": self.quote_depth,
                "flowed": self.flowed,
                "content": self.current_line
            })
        self.current_line = ""
        self.flowed = False
        self.buffer_filled = False

    def as_string(self):
        """
        Return the paragraphs as HTML.

        Each paragraph becomes a <p> element with class "flowed" or
        "fixed"; quoted paragraphs are wrapped in one <blockquote> per
        quote level. The content is HTML-escaped.
        """
        prev_quote_depth = 0
        out = []
        for ln in self.lines:
            while ln["quote_depth"] > prev_quote_depth:
                out.append("<blockquote>\n")
                prev_quote_depth += 1
            while ln["quote_depth"] < prev_quote_depth:
                out.append("</blockquote>\n")
                prev_quote_depth -= 1
            if ln["flowed"]:
                out.append("<p class='flowed'>" + html.escape(ln["content"]) + "</p>\n")
            else:
                out.append("<p class='fixed'>" + html.escape(ln["content"]) + "</p>\n")
        while 0 < prev_quote_depth:
            out.append("</blockquote>")
            prev_quote_depth -= 1
        return "".join(out)
2019-02-03 18:44:50 +01:00
# Archive every message of every mbox file given on the command line.
# Progress ("F <file>" here, "M <message id>" in archive) goes to stderr.
for f in sys.argv[1:]:
    print("F", f, file=sys.stderr)
    mb = mailbox.mbox(f)
    for m in mb:
        archive(m)
# vim: tw=79