Handle some more html tags and attributes

This commit is contained in:
Peter J. Holzer 2019-03-01 10:21:57 +01:00
parent b5c979d5cb
commit d1dc1db853
1 changed file with 21 additions and 13 deletions

View File

@@ -144,9 +144,13 @@ def archive(msg):
class HTMLPart(html.parser.HTMLParser):
allowed_tags = [ 'h2', 'a', 'wbr', 'hr', 'pre', 'img', 'font', 'i' ]
hide_tags = [ 'title' ]
ignore_tags = [ 'html', 'head', 'body' ]
allowed_tags = [
"h2", "a", "wbr", "hr", "pre", "img", "font", "i", "br", "table", "tr",
"th", "td", "b", "select", "option", "input", "sup", "address",
"center", "p", "h1", "dl", "h3"
]
hide_tags = [ "title" ]
ignore_tags = [ "html", "head", "body", "marquee", "meta", "form", ]
def __init__(self):
super().__init__()
@@ -160,7 +164,7 @@ class HTMLPart(html.parser.HTMLParser):
self.base = href[0]
elif tag in self.allowed_tags:
attrstr = "".join(
[' %s="%s"' % (a[0], html.escape(a[1]))
[' %s="%s"' % (a[0], html.escape(a[1])) if a[1] else ' %s' % (a[0])
for a in self.clean_attrs(tag, attrs)
]
)
@@ -190,9 +194,17 @@ class HTMLPart(html.parser.HTMLParser):
return "".join(self.content)
def clean_attrs(self, tag, attrs):
safe_attrs = [
"border", "alt", "size", "face", "width", "height", "hspace",
"cellpadding", "cellspacing", "bgcolor", "valign", "nowrap",
"color", "colspan", "name", "value", "type", "align", "clear",
"noshade"
]
clean_attrs = []
for a in attrs:
if a[0] == "href":
if a[0] in safe_attrs:
clean_attrs.append(a)
elif a[0] == "href":
url = a[1]
url = urllib.parse.urljoin(self.base, url)
u = urllib.parse.urlparse(url)
@@ -208,13 +220,8 @@ class HTMLPart(html.parser.HTMLParser):
clean_attrs.append((a[0], url))
else:
print("Ignored src attribute", a, file=sys.stderr)
elif a[0] == "border":
clean_attrs.append(a)
elif a[0] == "alt":
clean_attrs.append(a)
elif a[0] == "size":
clean_attrs.append(a)
elif a[0] == "target":
pass
else:
print("Encountered unknown attribute", a, file=sys.stderr)
return clean_attrs
@@ -283,9 +290,10 @@ class TextEnrichedPart:
for f in sys.argv[1:]:
print("F", f)
print("F", f, file=sys.stderr)
mb = mailbox.mbox(f)
for m in mb:
archive(m)
# vim: tw=79