#!/usr/bin/python3
"""Extract message ids from mbox files.

Parses all mbox files on the command line and extracts message ids from
Message-Id, In-Reply-To and References headers.

Just a test script to see what message ids look like in practice.

Output goes to stdout, one record per line, as a one-letter tag followed
by a value:

    F PATH   an mbox file is about to be read
    M ID     id found in the Message-ID header
    I ID     id found in the In-Reply-To header
    R ID     id found in the References header

Provenance: introduced as ``get_message_ids`` (mode 100755) by commit
80352f727f1226b4c7116d136ad5091678968242, "Extract message ids from mbox
files", Peter J. Holzer, Sat, 19 Jan 2019 21:18:14 +0100.
"""

import mailbox
import re
import sys

# Anything between angle brackets counts as an id.  Deliberately lax: the
# purpose of the script is to see what ids look like in the wild, not to
# validate them.
MSGID_RE = re.compile(r'<(.*?)>')

# (header name, output tag), in the order the records are printed.
HEADER_TAGS = (
    ("Message-ID", "M"),
    ("In-Reply-To", "I"),
    ("References", "R"),
)


def extract_ids(value):
    """Return the list of ``<...>``-delimited ids found in a header value.

    value is whatever the message returned for a header lookup: None when
    the header is absent, usually a str, but sometimes an
    email.header.Header.  The latter's __str__ method returns something
    sensible, so the value is simply forced to be a string.
    """
    if value is None:
        return []
    return MSGID_RE.findall(str(value))


def iter_message_ids(message):
    """Yield (tag, id) pairs for one message, in M, I, R header order.

    Only the first occurrence of each header is examined (that is what
    ``message.get`` returns); a missing header contributes nothing.
    """
    for header, tag in HEADER_TAGS:
        for msgid in extract_ids(message.get(header)):
            yield tag, msgid


def main(argv=None):
    """Print the message ids of every mbox file named in argv.

    argv is the list of mbox paths; it defaults to sys.argv[1:].

    Returns the process exit status: 0 on success, 1 as soon as a file
    cannot be opened or a message cannot be processed (details are written
    to stderr).
    """
    if argv is None:
        argv = sys.argv[1:]

    for path in argv:
        print("F", path)
        try:
            # create=False: this tool only reads, so a mistyped path must
            # be reported instead of silently creating an empty mbox.
            mbox = mailbox.mbox(path, create=False)
        except (mailbox.Error, OSError) as exc:
            print("Cannot open mbox:", path, file=sys.stderr)
            print("{}: {}".format(type(exc).__name__, exc), file=sys.stderr)
            return 1

        try:
            for message in mbox:
                try:
                    for tag, msgid in iter_message_ids(message):
                        print(tag, msgid)
                except Exception as exc:
                    # Not a bare except: Ctrl-C and SystemExit must still
                    # propagate.  Dump the offending message for analysis.
                    print("Error in message:", file=sys.stderr)
                    print("{}: {}".format(type(exc).__name__, exc),
                          file=sys.stderr)
                    print(message.as_string(), file=sys.stderr)
                    return 1
        finally:
            mbox.close()

    return 0


if __name__ == "__main__":
    sys.exit(main())