Clean up and include style sheets
This commit is contained in:
parent
b86ae9cb3f
commit
29b5288519
158
mbox2web
158
mbox2web
|
@ -14,6 +14,7 @@ import tempfile
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
import jinja2
|
import jinja2
|
||||||
|
import tinycss
|
||||||
|
|
||||||
basedir = "."
|
basedir = "."
|
||||||
|
|
||||||
|
@ -447,11 +448,45 @@ def archive(msg):
|
||||||
|
|
||||||
|
|
||||||
class HTMLPart(html.parser.HTMLParser):
|
class HTMLPart(html.parser.HTMLParser):
|
||||||
|
"""
|
||||||
|
A text/html part
|
||||||
|
|
||||||
|
This is a subclass of HTMLParser, so the handle_* methods will be invoked
|
||||||
|
as appropriate during parsing. There are a few additional attributes to
|
||||||
|
keep track of the state:
|
||||||
|
|
||||||
|
.. attribute:: content
|
||||||
|
|
||||||
|
Accumulates fragments of the final, cleaned-up HTML message as
|
||||||
|
strings
|
||||||
|
|
||||||
|
.. attribute:: base
|
||||||
|
|
||||||
|
The base URL
|
||||||
|
|
||||||
|
.. attribute:: extra
|
||||||
|
|
||||||
|
Context information. This includes info about cids or references to
|
||||||
|
other messages
|
||||||
|
|
||||||
|
.. attribute:: hide
|
||||||
|
|
||||||
|
If true, the content of the current tag is omitted from the output.
|
||||||
|
This is set when encountering a start tag in hide_tags, and reset at
|
||||||
|
each end tag (so it works only for leaves).
|
||||||
|
|
||||||
|
.. attribute:: current_tag
|
||||||
|
|
||||||
|
The current tag. Similar to hide, this is set and reset when
|
||||||
|
encountering start and end tags, so it is only correct while processing
|
||||||
|
a leaf element. But since we use it only for style elements, that's
|
||||||
|
acceptable.
|
||||||
|
"""
|
||||||
allowed_tags = [
|
allowed_tags = [
|
||||||
"h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
|
"h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
|
||||||
"th", "td", "b", "select", "option", "input", "sup", "address",
|
"th", "td", "b", "select", "option", "input", "sup", "address",
|
||||||
"center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
|
"center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
|
||||||
"h4", "div", "span"
|
"h4", "div", "span", "style",
|
||||||
]
|
]
|
||||||
hide_tags = [ "title" ]
|
hide_tags = [ "title" ]
|
||||||
ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
|
ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
|
||||||
|
@ -462,8 +497,10 @@ class HTMLPart(html.parser.HTMLParser):
|
||||||
self.content = []
|
self.content = []
|
||||||
self.base = None
|
self.base = None
|
||||||
self.extra = extra or {}
|
self.extra = extra or {}
|
||||||
|
self.current_tag = None
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
|
self.current_tag = tag
|
||||||
if tag == "base":
|
if tag == "base":
|
||||||
href = [x[1] for x in attrs if x[0] == "href"]
|
href = [x[1] for x in attrs if x[0] == "href"]
|
||||||
if href:
|
if href:
|
||||||
|
@ -494,8 +531,11 @@ class HTMLPart(html.parser.HTMLParser):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
print("Encountered unknown end tag", tag, file=sys.stderr)
|
print("Encountered unknown end tag", tag, file=sys.stderr)
|
||||||
|
self.current_tag = None
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
|
if self.current_tag == "style":
|
||||||
|
data = self.clean_style(data)
|
||||||
if not self.hide:
|
if not self.hide:
|
||||||
self.content.append(data)
|
self.content.append(data)
|
||||||
|
|
||||||
|
@ -536,10 +576,124 @@ class HTMLPart(html.parser.HTMLParser):
|
||||||
mid = a[1][4:]
|
mid = a[1][4:]
|
||||||
encmid = encode_message_id(mid)
|
encmid = encode_message_id(mid)
|
||||||
extra = "<a class='citesource' href='../%s'>\u2397</a>" % encmid
|
extra = "<a class='citesource' href='../%s'>\u2397</a>" % encmid
|
||||||
|
elif a[0] == "class":
|
||||||
|
clean_attrs.append((a[0], "msg-" + a[1],))
|
||||||
else:
|
else:
|
||||||
print("Encountered unknown attribute", a, file=sys.stderr)
|
print("Encountered unknown attribute", a, "in", tag, file=sys.stderr)
|
||||||
return clean_attrs, extra
|
return clean_attrs, extra
|
||||||
|
|
||||||
|
def clean_style(self, stylesheet):
    """
    Sanitize the text content of a <style> element.

    The stylesheet is parsed with tinycss and re-serialized rule by rule:

    * In every selector, an identifier that directly follows a "." or "#"
      delimiter gets the prefix "msg-", so that the message's rules cannot
      collide with the styles of the surrounding page.
    * Only declarations whose property name is listed in
      ``safe_declarations`` are kept. ``background-image`` is accepted only
      if its value is a single ``cid:`` URI. Properties listed in
      ``ignore_declarations`` are dropped silently, everything else is
      dropped with a diagnostic message.
    * At-rules are dropped with a diagnostic message.

    :param stylesheet: CSS source text
    :return: the cleaned-up stylesheet as a string
    """
    # Properties we recognize and consider harmless.
    safe_declarations = {
        "background-color",
        "border-bottom-style",
        "border-color",
        "border-left",
        "border-left-style",
        "border-right-style",
        "border-style",
        "border-top-style",
        "color",
        "font-family",
        "font-size",
        "font-style",
        "font-weight",
        "height",
        "list-style",
        "margin",
        "margin-bottom",
        "margin-left",
        "margin-right",
        "margin-top",
        "padding",
        "padding-bottom",
        "padding-left",
        "padding-right",
        "padding-top",
        "page-break-after",
        "text-align",
        "text-decoration",
        "white-space",
        "width",
    }

    # Ignore these silently to avoid cluttering logs
    ignore_declarations = {
        "mso-ansi-font-size",
        "mso-ansi-language",
        "mso-ascii-font-family",
        "mso-bidi-font-family",
        "mso-bidi-font-size",
        "mso-bidi-font-weight",
        "mso-fareast-font-family",
        "mso-fareast-language",
        "mso-gram-e",
        "mso-hansi-font-family",
        "mso-margin-bottom-alt",
        "mso-margin-top-alt",
        "mso-outline-level",
        "mso-pagination",
        "mso-spl-e",
        "mso-style-link",
        "mso-style-name",
        "mso-style-next",
        "mso-style-noshow",
        "mso-style-parent",
        "mso-style-priority",
        "mso-style-type",
        "page",  # doesn't exist in CSS 2.2
        "panose-1",  # doesn't exist in CSS 2.2
        "text-underline",  # doesn't exist in CSS 2.2
    }

    cssparser = tinycss.make_parser()
    parsed = cssparser.parse_stylesheet(stylesheet)
    chunks = []

    for rule in parsed.rules:
        # Only plain rule sets have the selector/declarations shape the
        # code below relies on. At-rules (@import, @media, @page) carry a
        # non-None at_keyword and different attributes, so skip them
        # instead of failing on a missing attribute.
        at_keyword = getattr(rule, "at_keyword", None)
        if at_keyword is not None:
            print("ignoring unsupported CSS at-rule", at_keyword)
            continue

        # First clean up selectors: prepend "msg-" to every identifier
        # that follows a "." or "#" delimiter.
        # NOTE(review): tinycss may tokenize "#id" as a single HASH token
        # rather than DELIM + IDENT, in which case ids are left untouched
        # -- confirm.
        next_is_local_id = False
        new_selector = []
        for token in rule.selector:
            if next_is_local_id and token.type == "IDENT":
                new_id = "msg-" + token.value
                new_selector.append(
                    tinycss.token_data.Token(
                        token.type, new_id, new_id,
                        token.unit, token.line, token.column,
                    )
                )
            else:
                new_selector.append(token)
            next_is_local_id = (
                token.type == "DELIM" and token.value in (".", "#")
            )
        chunks.append(tinycss.token_data.TokenList(new_selector).as_css())

        # Then clean up declarations.
        # We keep only declarations we recognize
        new_declarations = []
        for declaration in rule.declarations:
            name = declaration.name
            if name in safe_declarations:
                new_declarations.append(declaration)
            elif name == "background-image":
                # check if URL is cid, discard if not
                value = declaration.value
                is_cid_url = (
                    len(value) == 1
                    and value[0].type == "URI"
                    and value[0].value.startswith("cid:")
                )
                if is_cid_url:
                    print("accepting url", value[0].value)
                    # TODO: Get the real converted url here; for now a
                    # placeholder declaration is emitted.
                    # parse_style_attr returns (declarations, errors), so
                    # [0][0] is the first parsed declaration.
                    new_declarations.append(
                        cssparser.parse_style_attr(
                            "background-image: url(/whatever.png)"
                        )[0][0]
                    )
                else:
                    print("ignoring unsafe CSS property", declaration)
            elif name in ignore_declarations:
                pass
            else:
                print("ignoring unknown CSS property", name)

        chunks.append(" {\n")
        for declaration in new_declarations:
            chunks.append(
                "\t" + declaration.name + ":"
                + declaration.value.as_css() + ";\n"
            )
        chunks.append("}\n\n")

    return "".join(chunks)
|
||||||
|
|
||||||
|
|
||||||
class TextEnrichedPart:
|
class TextEnrichedPart:
|
||||||
class TEElement:
|
class TEElement:
|
||||||
|
|
Loading…
Reference in New Issue