Clean up and include style sheets
This commit is contained in:
parent
b86ae9cb3f
commit
29b5288519
158
mbox2web
158
mbox2web
|
@ -14,6 +14,7 @@ import tempfile
|
|||
import urllib.parse
|
||||
|
||||
import jinja2
|
||||
import tinycss
|
||||
|
||||
basedir = "."
|
||||
|
||||
|
@ -447,11 +448,45 @@ def archive(msg):
|
|||
|
||||
|
||||
class HTMLPart(html.parser.HTMLParser):
|
||||
"""
|
||||
A text/html part
|
||||
|
||||
This is a subclass of HTMLParser, so the handle_* methods will be invoked
|
||||
as appropriate during parsing. There are a few additional attributes to
|
||||
keep track of the state:
|
||||
|
||||
... attribute:: content
|
||||
|
||||
Accumulates fragments of the final, cleaned-up HTML message as
|
||||
strings
|
||||
|
||||
... attribute:: base
|
||||
|
||||
The base URL
|
||||
|
||||
... attribute:: extra
|
||||
|
||||
Context information. This includes info about cids or references to
|
||||
other messages
|
||||
|
||||
... attribute:: hide
|
||||
|
||||
If true, the content of the current tag is omitted from the output.
|
||||
This is set when encountering a start tag in hide_tags, and reset at
|
||||
each end tag (so it works only for leaves).
|
||||
|
||||
... attribute:: current_tag
|
||||
|
||||
The current tag. Similar to hide, this is set and reset when
|
||||
encountering start and end tags, so it is only correct while processing
|
||||
a leaf element. But since we use it only for style elements, that's
|
||||
acceptable.
|
||||
"""
|
||||
allowed_tags = [
|
||||
"h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
|
||||
"th", "td", "b", "select", "option", "input", "sup", "address",
|
||||
"center", "p", "h1", "dl", "h3", "ul", "li", "ol", "u", "blockquote",
|
||||
"h4", "div", "span"
|
||||
"h4", "div", "span", "style",
|
||||
]
|
||||
hide_tags = [ "title" ]
|
||||
ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
|
||||
|
@ -462,8 +497,10 @@ class HTMLPart(html.parser.HTMLParser):
|
|||
self.content = []
|
||||
self.base = None
|
||||
self.extra = extra or {}
|
||||
self.current_tag = None
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self.current_tag = tag
|
||||
if tag == "base":
|
||||
href = [x[1] for x in attrs if x[0] == "href"]
|
||||
if href:
|
||||
|
@ -494,8 +531,11 @@ class HTMLPart(html.parser.HTMLParser):
|
|||
pass
|
||||
else:
|
||||
print("Encountered unknown end tag", tag, file=sys.stderr)
|
||||
self.current_tag = None
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.current_tag == "style":
|
||||
data = self.clean_style(data)
|
||||
if not self.hide:
|
||||
self.content.append(data)
|
||||
|
||||
|
@ -536,10 +576,124 @@ class HTMLPart(html.parser.HTMLParser):
|
|||
mid = a[1][4:]
|
||||
encmid = encode_message_id(mid)
|
||||
extra = "<a class='citesource' href='../%s'>\u2397</a>" % encmid
|
||||
elif a[0] == "class":
|
||||
clean_attrs.append((a[0], "msg-" + a[1],))
|
||||
else:
|
||||
print("Encountered unknown attribute", a, file=sys.stderr)
|
||||
print("Encountered unknown attribute", a, "in", tag, file=sys.stderr)
|
||||
return clean_attrs, extra
|
||||
|
||||
def clean_style(self, stylesheet):
    """
    Return a sanitized version of the CSS text in *stylesheet*.

    The style sheet is parsed with tinycss and re-serialized rule by rule:

    * At-rules (@import, @media, @page, ...) are dropped.
    * In selectors, every class and id name gets a "msg-" prefix (the same
      prefix that is put on class attributes), so that rules from a message
      cannot match elements outside of it.
    * Only declarations from a fixed list of known-harmless properties are
      kept. "background-image" is kept only if its value is a single
      "cid:" URL, and is then replaced by a placeholder declaration.
      Known noise (mostly mso-*) is dropped silently, everything else is
      dropped with a diagnostic on stderr.

    :param stylesheet: the content of a style element as a string
    :returns: the cleaned style sheet as a string
    """
    # We keep only declarations we recognize
    safe_declarations = {
        "background-color",
        "border-bottom-style",
        "border-color",
        "border-left",
        "border-left-style",
        "border-right-style",
        "border-style",
        "border-top-style",
        "color",
        "font-family",
        "font-size",
        "font-style",
        "font-weight",
        "height",
        "list-style",
        "margin",
        "margin-bottom",
        "margin-left",
        "margin-right",
        "margin-top",
        "padding",
        "padding-bottom",
        "padding-left",
        "padding-right",
        "padding-top",
        "page-break-after",
        "text-align",
        "text-decoration",
        "white-space",
        "width",
    }

    # Ignore these silently to avoid cluttering logs
    ignore_declarations = {
        "mso-ansi-font-size",
        "mso-ansi-language",
        "mso-ascii-font-family",
        "mso-bidi-font-family",
        "mso-bidi-font-size",
        "mso-bidi-font-weight",
        "mso-fareast-font-family",
        "mso-fareast-language",
        "mso-gram-e",
        "mso-hansi-font-family",
        "mso-margin-bottom-alt",
        "mso-margin-top-alt",
        "mso-outline-level",
        "mso-pagination",
        "mso-spl-e",
        "mso-style-link",
        "mso-style-name",
        "mso-style-next",
        "mso-style-noshow",
        "mso-style-parent",
        "mso-style-priority",
        "mso-style-type",
        "page",            # doesn't exist in CSS 2.2
        "panose-1",        # doesn't exist in CSS 2.2
        "text-underline",  # doesn't exist in CSS 2.2
    }

    cssparser = tinycss.make_parser()
    parsed = cssparser.parse_stylesheet(stylesheet)
    chunks = []

    for rule in parsed.rules:
        # At-rules do not carry a token-list selector like ordinary rule
        # sets do, so the selector clean-up below cannot process them
        # (and we don't want @import to pull in external style sheets
        # anyway). tinycss marks them with a non-None at_keyword.
        at_keyword = getattr(rule, "at_keyword", None)
        if at_keyword is not None:
            print("ignoring CSS at-rule", at_keyword, file=sys.stderr)
            continue

        # First clean up selectors: Prepend "msg-" to every class or id
        next_is_local_id = False
        new_selector = []
        for token in rule.selector:
            if next_is_local_id and token.type == "IDENT":
                new_id = "msg-" + token.value
                new_selector.append(
                    tinycss.token_data.Token(
                        token.type, new_id, new_id,
                        token.unit, token.line, token.column))
            elif token.type == "HASH":
                # The tokenizer delivers "#foo" as a single HASH token,
                # not as a "#" delimiter followed by an identifier, so
                # ids need their own branch.
                new_id = "#msg-" + token.value[1:]
                new_selector.append(
                    tinycss.token_data.Token(
                        token.type, new_id, new_id,
                        token.unit, token.line, token.column))
            else:
                new_selector.append(token)
            next_is_local_id = (
                token.type == "DELIM" and token.value in (".", "#"))
        rule.selector = tinycss.token_data.TokenList(new_selector)
        chunks.append(rule.selector.as_css())

        # Then clean up declarations.
        new_declarations = []
        for declaration in rule.declarations:
            name = declaration.name
            if name in safe_declarations:
                new_declarations.append(declaration)
            elif name == "background-image":
                # Accept the declaration only if its value is exactly one
                # URL pointing to another part of the message (cid:)
                value = declaration.value
                if (len(value) == 1
                        and value[0].type == "URI"
                        and value[0].value.startswith("cid:")):
                    print("accepting url", value[0].value, file=sys.stderr)
                    # TODO: Get the real converted url here
                    # parse_style_attr returns (declarations, errors)
                    new_declarations.append(
                        cssparser.parse_style_attr(
                            "background-image: url(/whatever.png)")[0][0])
                else:
                    print("ignoring unsafe CSS property", declaration,
                          file=sys.stderr)
            elif name in ignore_declarations:
                pass
            else:
                print("ignoring unknown CSS property", name,
                      file=sys.stderr)

        chunks.append(" {\n")
        for declaration in new_declarations:
            chunks.append(
                "\t" + declaration.name + ":"
                + declaration.value.as_css() + ";\n")
        chunks.append("}\n\n")

    return "".join(chunks)
|
||||
|
||||
|
||||
class TextEnrichedPart:
|
||||
class TEElement:
|
||||
|
|
Loading…
Reference in New Issue