Clean up and include style sheets

2019-10-31 21:22:03 +01:00 · 2019-10-31 21:22:03 +01:00 · 29b5288519
parent b86ae9cb3f
commit 29b5288519
1 changed files with 156 additions and 2 deletions
--- a/158
+++ b/158
@ -14,6 +14,7 @@ import tempfile
 import urllib.parse

 import jinja2
+import tinycss

 basedir = "."

@ -447,11 +448,45 @@ def archive(msg):


 class HTMLPart(html.parser.HTMLParser):
+    """
+    A text/html part
+
+    This is a subclass of HTMLParser, so the handle_* methods will be invoked
+    as appropriate during parsing. There are a few additional attributes to
+    keep track of the state:
+
+    ... attribute:: content
+
+        Accumulates fragments of the final, cleaned-up HTML message as
+        strings
+
+    ... attribute:: base
+
+        The base URL
+
+    ... attribute:: extra
+
+        Context information. This includes info about cids or references to
+        other messages
+
+    ... attribute:: hide
+
+        If true, the content of the current tag is omitted from the output.
+        This is set when encountering a start tag in hide_tags, and reset at
+        each end tag (so it works only for leaves).
+
+    ... attribute:: current_tag
+
+        The current tag. Similar to hide, this is set and reset when
+        encountering start and end tags, so it is only correct while processing
+        a leaf element. But since we use it only for style elements, that's
+        acceptable.
+    """
    allowed_tags = [
        "h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
        "th", "td", "b", "select", "option", "input", "sup", "address",
        "center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
-        "h4", "div", "span"
+        "h4", "div", "span", "style",
    ]
    hide_tags = [ "title" ]
    ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
@ -462,8 +497,10 @@ class HTMLPart(html.parser.HTMLParser):
        self.content = []
        self.base = None
        self.extra = extra or {}
+        self.current_tag = None

    def handle_starttag(self, tag, attrs):
+        self.current_tag = tag
        if tag == "base":
            href = [x[1] for x in attrs if x[0] == "href"]
            if href:
@ -494,8 +531,11 @@ class HTMLPart(html.parser.HTMLParser):
            pass
        else:
            print("Encountered unknown end tag", tag, file=sys.stderr)
+        self.current_tag = None

    def handle_data(self, data):
+        if self.current_tag == "style":
+            data = self.clean_style(data)
        if not self.hide:
            self.content.append(data)

@ -536,10 +576,124 @@ class HTMLPart(html.parser.HTMLParser):
                    mid = a[1][4:]
                    encmid = encode_message_id(mid)
                    extra = "<a class='citesource' href='../%s'>\u2397</a>" % encmid
+            elif a[0] == "class":
+                clean_attrs.append((a[0], "msg-" + a[1],))
            else:
-                print("Encountered unknown attribute", a, file=sys.stderr)
+                print("Encountered unknown attribute", a, "in", tag, file=sys.stderr)
        return clean_attrs, extra

+    def clean_style(self, stylesheet):
+        cssparser = tinycss.make_parser()
+        stylesheet = cssparser.parse_stylesheet(stylesheet)
+        clean_stylesheet = ""
+
+        for rule in stylesheet.rules:
+
+            # first clean up selectors: Prepend "msg-" to every class or id
+            next_is_local_id = False
+            new_selector = []
+            for token in rule.selector:
+                if next_is_local_id and token.type == "IDENT":
+                    new_id = "msg-" + token.value
+                    new_selector.append(tinycss.token_data.Token(token.type, new_id, new_id, token.unit, token.line, token.column))
+                else:
+                    new_selector.append(token)
+                next_is_local_id = token.type == "DELIM" and (token.value == "." or token.value == "#")
+            rule.selector = tinycss.token_data.TokenList(new_selector)
+            clean_stylesheet += rule.selector.as_css()
+
+            # Then clean up declarations.
+            # We keep only declarations we recognize
+            safe_declarations = {
+                "background-color",
+                "border-bottom-style",
+                "border-color",
+                "border-left",
+                "border-left-style",
+                "border-right-style",
+                "border-style",
+                "border-top-style",
+                "color",
+                "font-family",
+                "font-size",
+                "font-style",
+                "font-weight",
+                "height",
+                "list-style",
+                "margin",
+                "margin-bottom",
+                "margin-left",
+                "margin-right",
+                "margin-top",
+                "padding",
+                "padding-bottom",
+                "padding-left",
+                "padding-right",
+                "padding-top",
+                "page-break-after",
+                "text-align",
+                "text-decoration",
+                "white-space",
+                "width",
+            }
+
+            # Ignore these silently to avoid cluttering logs
+            ignore_declarations = {
+                "mso-ansi-font-size",
+                "mso-ansi-language",
+                "mso-ascii-font-family",
+                "mso-bidi-font-family",
+                "mso-bidi-font-size",
+                "mso-bidi-font-weight",
+                "mso-fareast-font-family",
+                "mso-fareast-language",
+                "mso-gram-e",
+                "mso-hansi-font-family",
+                "mso-margin-bottom-alt",
+                "mso-margin-top-alt",
+                "mso-outline-level",
+                "mso-pagination",
+                "mso-spl-e",
+                "mso-style-link",
+                "mso-style-name",
+                "mso-style-next",
+                "mso-style-noshow",
+                "mso-style-parent",
+                "mso-style-priority",
+                "mso-style-type",
+                "page", # doesn't exist in CSS 2.2
+                "panose-1", # doesn't exist in CSS 2.2
+                "text-underline", # doesn't exist in CSS 2.2
+            }
+            new_declarations = []
+            for declaration in rule.declarations:
+                if declaration.name in safe_declarations:
+                    new_declarations.append(declaration)
+                elif declaration.name == "background-image":
+                    # check if URL is cid, discard if not
+                    ok = False
+                    if len(declaration.value) == 1:
+                        if declaration.value[0].type == "URI":
+                            if declaration.value[0].value.startswith("cid:"):
+                                print("accepting url", declaration.value[0].value)
+                                # Get the real converted url here
+                                new_declarations.append(
+                                    parser.parse_style_attr("background-image: url(/whatever.png)")[0][0]
+                                )
+                                ok = True
+                    if not ok:
+                        print("ignoring unsafe CSS property", declaration)
+                    pass
+                elif declaration.name in ignore_declarations:
+                    pass
+                else:
+                    print("ignoring unknown CSS property", declaration.name)
+            clean_stylesheet += " {\n"
+            for declaration in new_declarations:
+                clean_stylesheet += "\t" +  declaration.name + ":" + declaration.value.as_css() + ";\n"
+            clean_stylesheet += "}\n\n"
+        return clean_stylesheet
+

 class TextEnrichedPart:
    class TEElement: