Clean up and include style sheets

2019-10-31 21:22:03 +01:00 · 2019-10-31 21:22:03 +01:00 · 29b5288519
parent b86ae9cb3f
commit 29b5288519
1 changed files with 156 additions and 2 deletions
--- a/158
+++ b/158
@ -14,6 +14,7 @@ import tempfile
 import urllib.parse
 import jinja2
 import tinycss
 basedir = "."
@ -447,11 +448,45 @@ def archive(msg):
 class HTMLPart(html.parser.HTMLParser):
    """
    A text/html part
    This is a subclass of HTMLParser, so the handle_* methods will be invoked
    as appropriate during parsing. There are a few additional attributes to
    keep track of the state:
    ... attribute:: content
        Accumulates fragments of the final, cleaned-up HTML message as
        strings
    ... attribute:: base
        The base URL
    ... attribute:: extra
        Context information. This includes info about cids or references to
        other messages
    ... attribute:: hide
        If true, the content of the current tag is omitted from the output.
        This is set when encountering a start tag in hide_tags, and reset at
        each end tag (so it works only for leaves).
    ... attribute:: current_tag
        The current tag. Similar to hide, this is set and reset when
        encountering start and end tags, so it is only correct while processing
        a leaf element. But since we use it only for style elements, that's
        acceptable.
    """
    allowed_tags = [
        "h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
        "th", "td", "b", "select", "option", "input", "sup", "address",
        "center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
-        "h4", "div", "span"
+        "h4", "div", "span", "style",
    ]
    hide_tags = [ "title" ]
    ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
@ -462,8 +497,10 @@ class HTMLPart(html.parser.HTMLParser):
        self.content = []
        self.base = None
        self.extra = extra or {}
        self.current_tag = None
    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        if tag == "base":
            href = [x[1] for x in attrs if x[0] == "href"]
            if href:
@ -494,8 +531,11 @@ class HTMLPart(html.parser.HTMLParser):
            pass
        else:
            print("Encountered unknown end tag", tag, file=sys.stderr)
        self.current_tag = None
    def handle_data(self, data):
        if self.current_tag == "style":
            data = self.clean_style(data)
        if not self.hide:
            self.content.append(data)
@ -536,10 +576,124 @@ class HTMLPart(html.parser.HTMLParser):
                    mid = a[1][4:]
                    encmid = encode_message_id(mid)
                    extra = "<a class='citesource' href='../%s'>\u2397</a>" % encmid
            elif a[0] == "class":
                clean_attrs.append((a[0], "msg-" + a[1],))
            else:
-                print("Encountered unknown attribute", a, file=sys.stderr)
+                print("Encountered unknown attribute", a, "in", tag, file=sys.stderr)
        return clean_attrs, extra
    def clean_style(self, stylesheet):
        """
        Sanitize the contents of a style element

        The style sheet is parsed with tinycss and serialized again rule by
        rule:

        * In every selector, class and id names get the prefix "msg-".
        * Only declarations for a whitelist of properties are kept. A list of
          known noise (mostly MS Office "mso-*" properties) is dropped
          silently, everything else is dropped with a diagnostic on stderr.
        * background-image is only accepted if its value is a single cid:
          URL. For now it is replaced by a placeholder URL.
        * At-rules (e.g. @import, @media, @page) are dropped: they carry no
          selector token list to rewrite.

        :param stylesheet: CSS source text
        :returns: the cleaned up CSS source text
        """
        # We keep only declarations we recognize
        safe_declarations = frozenset({
            "background-color",
            "border-bottom-style",
            "border-color",
            "border-left",
            "border-left-style",
            "border-right-style",
            "border-style",
            "border-top-style",
            "color",
            "font-family",
            "font-size",
            "font-style",
            "font-weight",
            "height",
            "list-style",
            "margin",
            "margin-bottom",
            "margin-left",
            "margin-right",
            "margin-top",
            "padding",
            "padding-bottom",
            "padding-left",
            "padding-right",
            "padding-top",
            "page-break-after",
            "text-align",
            "text-decoration",
            "white-space",
            "width",
        })
        # Ignore these silently to avoid cluttering logs
        ignore_declarations = frozenset({
            "mso-ansi-font-size",
            "mso-ansi-language",
            "mso-ascii-font-family",
            "mso-bidi-font-family",
            "mso-bidi-font-size",
            "mso-bidi-font-weight",
            "mso-fareast-font-family",
            "mso-fareast-language",
            "mso-gram-e",
            "mso-hansi-font-family",
            "mso-margin-bottom-alt",
            "mso-margin-top-alt",
            "mso-outline-level",
            "mso-pagination",
            "mso-spl-e",
            "mso-style-link",
            "mso-style-name",
            "mso-style-next",
            "mso-style-noshow",
            "mso-style-parent",
            "mso-style-priority",
            "mso-style-type",
            "page", # doesn't exist in CSS 2.2
            "panose-1", # doesn't exist in CSS 2.2
            "text-underline", # doesn't exist in CSS 2.2
        })

        cssparser = tinycss.make_parser()
        parsed = cssparser.parse_stylesheet(stylesheet)
        chunks = []
        for rule in parsed.rules:
            # Plain rule sets have at_keyword None. At-rules have no selector
            # made of tokens, so we cannot rewrite them and drop them instead.
            at_keyword = getattr(rule, "at_keyword", None)
            if at_keyword is not None:
                print("ignoring CSS at-rule", at_keyword, file=sys.stderr)
                continue

            # first clean up selectors: Prepend "msg-" to every class or id
            next_is_local_id = False
            new_selector = []
            for token in rule.selector:
                new_value = None
                if next_is_local_id and token.type == "IDENT":
                    new_value = "msg-" + token.value
                elif token.type == "HASH":
                    # "#foo" is tokenized as a single HASH token whose value
                    # still includes the leading "#"
                    new_value = "#msg-" + token.value[1:]
                if new_value is None:
                    new_selector.append(token)
                else:
                    new_selector.append(
                        tinycss.token_data.Token(
                            token.type, new_value, new_value,
                            token.unit, token.line, token.column
                        )
                    )
                next_is_local_id = token.type == "DELIM" and token.value in (".", "#")
            chunks.append(tinycss.token_data.TokenList(new_selector).as_css())

            # Then clean up declarations.
            new_declarations = []
            for declaration in rule.declarations:
                name = declaration.name
                if name in safe_declarations:
                    new_declarations.append(declaration)
                elif name == "background-image":
                    # check if URL is cid, discard if not
                    value = declaration.value
                    if (
                        len(value) == 1
                        and value[0].type == "URI"
                        and value[0].value.startswith("cid:")
                    ):
                        print("accepting url", value[0].value, file=sys.stderr)
                        # TODO: Get the real converted url here
                        placeholder, _errors = cssparser.parse_style_attr(
                            "background-image: url(/whatever.png)"
                        )
                        new_declarations.append(placeholder[0])
                    else:
                        print("ignoring unsafe CSS property", declaration, file=sys.stderr)
                elif name not in ignore_declarations:
                    print("ignoring unknown CSS property", name, file=sys.stderr)

            chunks.append(" {\n")
            for declaration in new_declarations:
                chunks.append("\t" + declaration.name + ":" + declaration.value.as_css() + ";\n")
            chunks.append("}\n\n")
        return "".join(chunks)
 class TextEnrichedPart:
    class TEElement: