From 5e96a737447ffe2541aca0ea9216a27a6e2f3cde Mon Sep 17 00:00:00 2001 From: "Peter J. Holzer" Date: Mon, 20 May 2019 23:06:08 +0200 Subject: [PATCH] Tolerate decoding errors Sometimes the charset is just wrong, or it uses a non-standard name. Try to do something useful in these cases. --- mbox2web | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/mbox2web b/mbox2web index cd3e842..902cae5 100755 --- a/mbox2web +++ b/mbox2web @@ -136,8 +136,16 @@ def render_body(msg, extra=None): format = ct_params.get("format", "fixed") if format == "fixed": bodytmpl = jenv.get_template("body_text_plain.html") + partbytes = msg.get_payload(decode=True) + try: + parttext = partbytes.decode(charset, errors="replace") + except LookupError as e: + # Unknown encoding? Probably win-1252 + print(e, file=sys.stderr) + parttext = partbytes.decode("windows-1252", errors="replace") + context = { - "body": msg.get_payload(decode=True).decode(charset) + "body": parttext } return bodytmpl.render(context) elif format == "flowed": @@ -653,7 +661,12 @@ class TextFlowedPart: charset = ct_params.get("charset", "iso-8859-1") format = ct_params.get("format", "fixed") delsp = ct_params.get("delsp", "no") == "yes" - raw_text = msg.get_payload(decode=True).decode(charset) + charset_map = { + "x-mac-roman": "mac_roman", + } + if charset in charset_map: + charset = charset_map[charset] + raw_text = msg.get_payload(decode=True).decode(charset, errors="replace") raw_lines = raw_text.split("\n") for rl in raw_lines: