Clean up and include style sheets

This commit is contained in:
Peter J. Holzer 2019-10-31 21:22:03 +01:00
parent b86ae9cb3f
commit 29b5288519
1 changed files with 156 additions and 2 deletions

158
mbox2web
View File

@ -14,6 +14,7 @@ import tempfile
import urllib.parse import urllib.parse
import jinja2 import jinja2
import tinycss
basedir = "." basedir = "."
@ -447,11 +448,45 @@ def archive(msg):
class HTMLPart(html.parser.HTMLParser): class HTMLPart(html.parser.HTMLParser):
"""
A text/html part
This is a subclass of HTMLParser, so the handle_* methods will be invoked
as appropriate during parsing. There are a few additional attributes to
keep track of the state:
... attribute:: content
Accumulates fragments of the final, cleaned up, html message as
strings
... attribute:: base
The base URL
... attribute:: extra
Context information. This includes info about cids or references to
other messages
... attribute:: hide
If true, the content of the current tag is omitted from the output.
This is set when encountering a start tag in hide_tags, and reset at
each end tag (so it works only for leaves).
... attribute:: current_tag
The current tag. Similar to hide, this is set and reset when
encountering start and end tags, so it is only correct while processing
a leaf element. But since we use it only for style elements, that's
acceptable.
"""
allowed_tags = [ allowed_tags = [
"h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr", "h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
"th", "td", "b", "select", "option", "input", "sup", "address", "th", "td", "b", "select", "option", "input", "sup", "address",
"center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote", "center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
"h4", "div", "span" "h4", "div", "span", "style",
] ]
hide_tags = [ "title" ] hide_tags = [ "title" ]
ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ] ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
@ -462,8 +497,10 @@ class HTMLPart(html.parser.HTMLParser):
self.content = [] self.content = []
self.base = None self.base = None
self.extra = extra or {} self.extra = extra or {}
self.current_tag = None
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
self.current_tag = tag
if tag == "base": if tag == "base":
href = [x[1] for x in attrs if x[0] == "href"] href = [x[1] for x in attrs if x[0] == "href"]
if href: if href:
@ -494,8 +531,11 @@ class HTMLPart(html.parser.HTMLParser):
pass pass
else: else:
print("Encountered unknown end tag", tag, file=sys.stderr) print("Encountered unknown end tag", tag, file=sys.stderr)
self.current_tag = None
def handle_data(self, data): def handle_data(self, data):
if self.current_tag == "style":
data = self.clean_style(data)
if not self.hide: if not self.hide:
self.content.append(data) self.content.append(data)
@ -536,10 +576,124 @@ class HTMLPart(html.parser.HTMLParser):
mid = a[1][4:] mid = a[1][4:]
encmid = encode_message_id(mid) encmid = encode_message_id(mid)
extra = "<a class='citesource' href='../%s'>\u2397</a>" % encmid extra = "<a class='citesource' href='../%s'>\u2397</a>" % encmid
elif a[0] == "class":
clean_attrs.append((a[0], "msg-" + a[1],))
else: else:
print("Encountered unknown attribute", a, file=sys.stderr) print("Encountered unknown attribute", a, "in", tag, file=sys.stderr)
return clean_attrs, extra return clean_attrs, extra
def clean_style(self, stylesheet):
    """
    Sanitize the text content of a <style> element.

    The stylesheet is parsed with tinycss and serialized again rule by
    rule:

    * In every selector, an identifier that directly follows a "." or "#"
      delimiter gets the prefix "msg-" (the same prefix that is put in
      front of class attribute values).
    * Declarations named in safe_declarations are kept unchanged.
    * A background-image declaration is kept only if its value is a
      single cid: URI; it is currently replaced by a placeholder URL.
    * Declarations named in ignore_declarations are dropped silently,
      all others are dropped with a message.
    * At-rules (e.g. @import, @media, @page) are dropped with a message,
      since they do not carry a token-list selector to rewrite.

    :param stylesheet: the CSS source as a string
    :returns: the cleaned up CSS source as a string
    """
    # We keep only declarations we recognize
    safe_declarations = {
        "background-color",
        "border-bottom-style",
        "border-color",
        "border-left",
        "border-left-style",
        "border-right-style",
        "border-style",
        "border-top-style",
        "color",
        "font-family",
        "font-size",
        "font-style",
        "font-weight",
        "height",
        "list-style",
        "margin",
        "margin-bottom",
        "margin-left",
        "margin-right",
        "margin-top",
        "padding",
        "padding-bottom",
        "padding-left",
        "padding-right",
        "padding-top",
        "page-break-after",
        "text-align",
        "text-decoration",
        "white-space",
        "width",
    }
    # Ignore these silently to avoid cluttering logs
    ignore_declarations = {
        "mso-ansi-font-size",
        "mso-ansi-language",
        "mso-ascii-font-family",
        "mso-bidi-font-family",
        "mso-bidi-font-size",
        "mso-bidi-font-weight",
        "mso-fareast-font-family",
        "mso-fareast-language",
        "mso-gram-e",
        "mso-hansi-font-family",
        "mso-margin-bottom-alt",
        "mso-margin-top-alt",
        "mso-outline-level",
        "mso-pagination",
        "mso-spl-e",
        "mso-style-link",
        "mso-style-name",
        "mso-style-next",
        "mso-style-noshow",
        "mso-style-parent",
        "mso-style-priority",
        "mso-style-type",
        "page",  # doesn't exist in CSS 2.2
        "panose-1",  # doesn't exist in CSS 2.2
        "text-underline",  # doesn't exist in CSS 2.2
    }

    cssparser = tinycss.make_parser()
    parsed = cssparser.parse_stylesheet(stylesheet)
    chunks = []
    for rule in parsed.rules:
        # Only plain rule sets have a token-list selector plus
        # declarations. At-rules would break the selector rewriting below
        # (and @import must not survive sanitizing anyway), so drop them.
        if getattr(rule, "at_keyword", None) is not None:
            print("ignoring unsupported at-rule", rule.at_keyword)
            continue

        # First clean up selectors: Prepend "msg-" to every class or id.
        # NOTE(review): tinycss may tokenize "#name" as a single HASH
        # token, in which case the "#" case never triggers - confirm.
        next_is_local_id = False
        new_selector = []
        for token in rule.selector:
            if next_is_local_id and token.type == "IDENT":
                new_id = "msg-" + token.value
                new_selector.append(
                    tinycss.token_data.Token(
                        token.type, new_id, new_id,
                        token.unit, token.line, token.column,
                    )
                )
            else:
                new_selector.append(token)
            next_is_local_id = token.type == "DELIM" and token.value in (".", "#")
        rule.selector = tinycss.token_data.TokenList(new_selector)
        chunks.append(rule.selector.as_css())

        # Then clean up declarations.
        new_declarations = []
        for declaration in rule.declarations:
            if declaration.name in safe_declarations:
                new_declarations.append(declaration)
            elif declaration.name == "background-image":
                # Accept the image only if it is a single cid: URL,
                # discard it otherwise.
                value = declaration.value
                if (
                    len(value) == 1
                    and value[0].type == "URI"
                    and value[0].value.startswith("cid:")
                ):
                    print("accepting url", value[0].value)
                    # TODO: Get the real converted url here; for now a
                    # placeholder declaration is substituted.
                    # parse_style_attr returns (declarations, errors).
                    new_declarations.append(
                        cssparser.parse_style_attr("background-image: url(/whatever.png)")[0][0]
                    )
                else:
                    print("ignoring unsafe CSS property", declaration)
            elif declaration.name in ignore_declarations:
                pass
            else:
                print("ignoring unknown CSS property", declaration.name)

        chunks.append(" {\n")
        for declaration in new_declarations:
            chunks.append("\t" + declaration.name + ":" + declaration.value.as_css() + ";\n")
        chunks.append("}\n\n")
    return "".join(chunks)
class TextEnrichedPart: class TextEnrichedPart:
class TEElement: class TEElement: