From 29b52885195e272aa32fa2fc47aa872b9b5896f2 Mon Sep 17 00:00:00 2001
From: "Peter J. Holzer"
Date: Thu, 31 Oct 2019 21:22:03 +0100
Subject: [PATCH] Clean up and include style sheets

---
 mbox2web | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 172 insertions(+), 2 deletions(-)

diff --git a/mbox2web b/mbox2web
index 0cd4048..ee1a0b3 100755
--- a/mbox2web
+++ b/mbox2web
@@ -14,6 +14,7 @@ import tempfile
 import urllib.parse
 
 import jinja2
+import tinycss
 
 basedir = "."
 
@@ -447,11 +448,45 @@ def archive(msg):
 
 
 class HTMLPart(html.parser.HTMLParser):
+    """
+    A text/html part
+
+    This is a subclass of HTMLParser, so the handle_* methods will be invoked
+    as appropriate during parsing. There are a few additional attributes to
+    keep track of the state:
+
+    .. attribute:: content
+
+        Accumulates fragments of the final, cleaned-up HTML message as
+        strings
+
+    .. attribute:: base
+
+        The base URL
+
+    .. attribute:: extra
+
+        Context information. This includes info about cids or references to
+        other messages
+
+    .. attribute:: hide
+
+        If true, the content of the current tag is omitted from the output.
+        This is set when encountering a start tag in hide_tags, and reset at
+        each end tag (so it works only for leaves).
+
+    .. attribute:: current_tag
+
+        The current tag. Similar to hide, this is set and reset when
+        encountering start and end tags, so it is only correct while processing
+        a leaf element. But since we use it only for style elements, that's
+        acceptable.
+    """
     allowed_tags = [
         "h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
         "th", "td", "b", "select", "option", "input", "sup", "address",
         "center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
-        "h4", "div", "span"
+        "h4", "div", "span", "style",
     ]
     hide_tags = [ "title" ]
     ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
@@ -462,8 +497,10 @@ class HTMLPart(html.parser.HTMLParser):
         self.content = []
         self.base = None
         self.extra = extra or {}
+        self.current_tag = None
 
     def handle_starttag(self, tag, attrs):
+        self.current_tag = tag
         if tag == "base":
             href = [x[1] for x in attrs if x[0] == "href"]
             if href:
@@ -494,8 +531,11 @@ class HTMLPart(html.parser.HTMLParser):
             pass
         else:
             print("Encountered unknown end tag", tag, file=sys.stderr)
+        self.current_tag = None
 
     def handle_data(self, data):
+        if self.current_tag == "style":
+            data = self.clean_style(data)
         if not self.hide:
             self.content.append(data)
 
@@ -536,10 +576,140 @@ class HTMLPart(html.parser.HTMLParser):
                         mid = a[1][4:]
                         encmid = encode_message_id(mid)
                         extra = "\u2397" % encmid
+            elif a[0] == "class":
+                clean_attrs.append((a[0], "msg-" + a[1],))
             else:
-                print("Encountered unknown attribute", a, file=sys.stderr)
+                print("Encountered unknown attribute", a, "in", tag, file=sys.stderr)
         return clean_attrs, extra
 
+    def clean_style(self, stylesheet):
+        """
+        Sanitize the contents of a style element
+
+        The stylesheet is parsed with tinycss. Rule sets are kept, but "msg-"
+        is prepended to every identifier following a "." or "#" in their
+        selectors (class attributes get the same prefix). Only declarations
+        listed in safe_declarations are kept; background-image is accepted
+        only for a single cid: URL and is currently replaced by a placeholder.
+        At-rules are dropped.
+
+        Returns the cleaned stylesheet as a string.
+        """
+        # We keep only declarations we recognize
+        safe_declarations = {
+            "background-color",
+            "border-bottom-style",
+            "border-color",
+            "border-left",
+            "border-left-style",
+            "border-right-style",
+            "border-style",
+            "border-top-style",
+            "color",
+            "font-family",
+            "font-size",
+            "font-style",
+            "font-weight",
+            "height",
+            "list-style",
+            "margin",
+            "margin-bottom",
+            "margin-left",
+            "margin-right",
+            "margin-top",
+            "padding",
+            "padding-bottom",
+            "padding-left",
+            "padding-right",
+            "padding-top",
+            "page-break-after",
+            "text-align",
+            "text-decoration",
+            "white-space",
+            "width",
+        }
+
+        # Ignore these silently to avoid cluttering logs
+        ignore_declarations = {
+            "mso-ansi-font-size",
+            "mso-ansi-language",
+            "mso-ascii-font-family",
+            "mso-bidi-font-family",
+            "mso-bidi-font-size",
+            "mso-bidi-font-weight",
+            "mso-fareast-font-family",
+            "mso-fareast-language",
+            "mso-gram-e",
+            "mso-hansi-font-family",
+            "mso-margin-bottom-alt",
+            "mso-margin-top-alt",
+            "mso-outline-level",
+            "mso-pagination",
+            "mso-spl-e",
+            "mso-style-link",
+            "mso-style-name",
+            "mso-style-next",
+            "mso-style-noshow",
+            "mso-style-parent",
+            "mso-style-priority",
+            "mso-style-type",
+            "page", # doesn't exist in CSS 2.2
+            "panose-1", # doesn't exist in CSS 2.2
+            "text-underline", # doesn't exist in CSS 2.2
+        }
+
+        cssparser = tinycss.make_parser()
+        stylesheet = cssparser.parse_stylesheet(stylesheet)
+        clean_stylesheet = ""
+
+        for rule in stylesheet.rules:
+            if rule.at_keyword is not None:
+                # At-rules (e.g. @import, @media) carry no token-list selector
+                # to rewrite; drop them instead of crashing on rule.selector.
+                continue
+
+            # first clean up selectors: Prepend "msg-" to every class or id
+            next_is_local_id = False
+            new_selector = []
+            for token in rule.selector:
+                if next_is_local_id and token.type == "IDENT":
+                    new_id = "msg-" + token.value
+                    new_selector.append(tinycss.token_data.Token(token.type, new_id, new_id, token.unit, token.line, token.column))
+                else:
+                    new_selector.append(token)
+                next_is_local_id = token.type == "DELIM" and (token.value == "." or token.value == "#")
+            rule.selector = tinycss.token_data.TokenList(new_selector)
+            clean_stylesheet += rule.selector.as_css()
+
+            # Then clean up declarations.
+            new_declarations = []
+            for declaration in rule.declarations:
+                if declaration.name in safe_declarations:
+                    new_declarations.append(declaration)
+                elif declaration.name == "background-image":
+                    # check if URL is cid, discard if not
+                    ok = False
+                    if len(declaration.value) == 1:
+                        if declaration.value[0].type == "URI":
+                            if declaration.value[0].value.startswith("cid:"):
+                                print("accepting url", declaration.value[0].value, file=sys.stderr)
+                                # Get the real converted url here
+                                new_declarations.append(
+                                    cssparser.parse_style_attr("background-image: url(/whatever.png)")[0][0]
+                                )
+                                ok = True
+                    if not ok:
+                        print("ignoring unsafe CSS property", declaration, file=sys.stderr)
+                elif declaration.name in ignore_declarations:
+                    pass
+                else:
+                    print("ignoring unknown CSS property", declaration.name, file=sys.stderr)
+            clean_stylesheet += " {\n"
+            for declaration in new_declarations:
+                clean_stylesheet += "\t" + declaration.name + ":" + declaration.value.as_css() + ";\n"
+            clean_stylesheet += "}\n\n"
+        return clean_stylesheet
+
 
 class TextEnrichedPart:
     class TEElement: