diff --git a/.gitignore b/.gitignore index 8e5f646e..97b36f23 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,9 @@ __pycache__/ !/example-msg-files/expected-outputs/2016-02-23_0657 MSG Test File/* !/example-msg-files/strangeDate.msg !/example-msg-files/unicode.msg +!/example-msg-files/multi-to.msg +!/example-msg-files/multi-to-to.msg +!/example-msg-files/unicode-header.msg # Reserved Folders /output diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fbebaa2..cf816211 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ **v0.55.0** +* [[TeamMsgExtractor #???](https://github.com/TeamMsgExtractor/msg-extractor/issues/???)] Fixed `MessageBase.asEmailMessage()` raising `ValueError` when a message has multiple `To` recipients. * [[TeamMsgExtractor #465](https://github.com/TeamMsgExtractor/msg-extractor/issues/465)] Added missing `msg.close()` to `openMsg()`. If the MSG file was actually just a plain OLE file, it would be left open. * Adjusted the default value of `maxNameLength` for `MessageBase.save()` to 40 instead of 256. * Adjusted exception handling for `MessageBase.save()` to properly report the reason a folder fails to be created. 
from email.header import decode_header as _decode_header, Header as _Header

# Matches one RFC 2047 encoded word: =?charset?encoding?encoded-text?=
# where encoding is B (base64) or Q (quoted-printable), either case.
_RFC2047_WORD = re.compile(r'=\?[^?]+\?[bBqQ]\?[^?]*\?=')

# Encodings to try when a declared charset fails, keyed by normalised charset name.
# GBK is a strict superset of GB2312 and accepts the ASCII-range second bytes that
# GB2312 rejects, so real-world GB2312-labelled headers often decode correctly as GBK.
_CHARSET_FALLBACKS: Dict[str, Tuple[str, ...]] = {
    'gb2312': ('gbk', 'cp936'),
    'gb_2312': ('gbk', 'cp936'),
    'gb_2312-80': ('gbk', 'cp936'),
}


def _decode_with_fallbacks(raw: bytes, charset) -> str:
    """
    Decode the payload of one encoded word.

    The declared charset is tried first ('ascii' when none is declared), then
    any alternatives listed in _CHARSET_FALLBACKS for it, and finally latin-1,
    which accepts every byte value and therefore cannot fail.
    """
    candidates = (charset or 'ascii',) + _CHARSET_FALLBACKS.get((charset or '').lower(), ())
    for codec in candidates:
        try:
            return raw.decode(codec)
        except (UnicodeDecodeError, LookupError):
            continue
    return raw.decode('latin-1', 'replace')


def _encode_utf8_word(text: str) -> str:
    """
    Return *text* as one or more space-separated UTF-8 RFC 2047 encoded words.

    Header.encode() is used rather than str(Header): str() returns the decoded
    Unicode text, whereas encode() returns the RFC 2047 form. encode() folds
    long values onto continuation lines; encoded words never contain
    whitespace, so re-joining on single spaces yields a one-line value.
    """
    return ' '.join(_Header(text, charset='utf-8').encode().split())


def _fix_encoded_word(m: re.Match) -> str:
    """
    Regex substitution callback: decode one RFC 2047 encoded word and
    re-emit it as a valid UTF-8 encoded word.

    If the declared charset fails (e.g. GB2312-labelled GBK bytes), charset
    fallbacks from _CHARSET_FALLBACKS are tried before falling back to
    latin-1 with replacement characters.

    The result is always an encoded word, never raw decoded text, so that
    characters with structural meaning in a header (parentheses, commas,
    angle brackets, newlines) cannot alter how the surrounding header parses.

    :param m: Match object produced by _RFC2047_WORD.
    :returns: The re-encoded word, or the original text unchanged if it could
        not be processed.
    """
    word = m.group(0)
    try:
        parts = _decode_header(word)
        if len(parts) != 1 or not isinstance(parts[0][0], bytes):
            return word
        raw, charset = parts[0]
        return _encode_utf8_word(_decode_with_fallbacks(raw, charset))
    except Exception:
        # Deliberately best-effort: a word that cannot be repaired is left
        # exactly as it was so that the downstream parser can deal with it.
        return word


def _preprocess_encoded_words(text: str) -> str:
    """
    Replace every RFC 2047 encoded word in *text* with a clean UTF-8
    encoded word. Text outside encoded words is left untouched.
    """
    return _RFC2047_WORD.sub(_fix_encoded_word, text)


def _sanitize_header(value: str) -> str:
    """
    Prepare a single, already unfolded header value for assignment to an
    EmailMessage.

    RFC 2047 encoded words are re-encoded as UTF-8. Raw non-ASCII text is
    passed through as Unicode rather than being wrapped in an encoded word:
    encoding the whole value would also swallow structural parts such as
    ``<user@example.com>``. Lone surrogate code points, which cannot be
    encoded as UTF-8, are replaced with ``?`` so that later serialisation
    does not raise UnicodeEncodeError because of them.

    :param value: The header value to clean.
    :returns: The cleaned header value.
    """
    if '=?' in value:
        value = _RFC2047_WORD.sub(_fix_encoded_word, value)

    if not value.isascii():
        # Identity for well-formed text; only un-encodable surrogates change.
        value = value.encode('utf-8', 'replace').decode('utf-8')

    return value
GB2312-labelled GBK bytes), then parse with the + # modern policy so that RFC 5322 unfolding and RFC 2047 decoding are + # handled natively and the values arrive as clean Unicode strings. + # Fall back to the compat32-derived self.header when no raw text is + # stored (synthesised headers from MAPI properties). + _ADDRESS_HEADERS = frozenset({'to', 'cc', 'bcc', 'reply-to'}) + if self.headerText: + raw = self.headerText + if raw.startswith('Microsoft Mail Internet Headers Version 2.0'): + raw = raw[43:].lstrip() + raw = _preprocess_encoded_words(raw) + source_items = list(HeaderParser(policy = policy.default).parsestr(raw).items()) + sanitize = False + else: + source_items = list(self.header.items()) + sanitize = True + + # Address headers (To/CC/BCC/Reply-To) may appear once per recipient in + # the stored header block; merge them into a single comma-separated value. + # All other headers — including multi-valued trace headers like Received + # and Authentication-Results — are forwarded as-is; EmailMessage appends + # rather than replaces, so natural repetition is preserved correctly. + address_merged: Dict[str, Tuple[str, str]] = {} + for key, value in source_items: + if key.lower() == 'content-type': + continue + if sanitize: + # compat32 values are folded and may contain raw encoded words. 
class MessageTests(unittest.TestCase):
    """
    Tests for opening MSG files with several recipients or non-ASCII headers
    and converting them with asEmailMessage().
    """

    @staticmethod
    def _emailsOfType(msg, recipientType):
        """
        Returns the email addresses, with NUL characters stripped from both
        ends, of every recipient of :param msg: whose type equals
        :param recipientType:.
        """
        return [r.email.strip('\x00') for r in msg.recipients if r.type == recipientType]

    @staticmethod
    def _headerCount(em, name):
        """
        Returns how many header fields of :param em: are called :param name:,
        ignoring case.
        """
        name = name.lower()
        return sum(1 for key in em.keys() if key.lower() == name)

    def testMultiTo(self):
        """
        Tests parsing a message with multiple To and CC recipients.
        """
        with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg:
            self.assertIsInstance(msg, Message)

            self.assertTrue((msg.subject or '').startswith('Test: multiple To recipients'))
            # NOTE(review): the trailing space suggests an angle-bracket
            # address may have been lost from this literal - confirm.
            self.assertEqual(msg.sender, 'Bob Sender ')
            self.assertTrue((msg.body or '').startswith('Test email body.'))

            # Expect at least two To recipients (type 1) and at least one CC (type 2).
            to_emails = self._emailsOfType(msg, 1)
            cc_emails = self._emailsOfType(msg, 2)

            self.assertGreaterEqual(len(to_emails), 2)
            self.assertGreaterEqual(len(cc_emails), 1)

            self.assertIn('alice@example.com', to_emails)
            self.assertIn('carol@example.com', to_emails)
            self.assertIn('dave@example.com', cc_emails)

    def testMultiToTo(self):
        """
        Tests parsing a message where To headers use mixed casing (e.g. 'To' vs 'TO').
        """
        with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg:
            self.assertIsInstance(msg, Message)
            self.assertTrue((msg.subject or '').startswith('Test: multiple To recipients'))

            to_emails = self._emailsOfType(msg, 1)
            cc_emails = self._emailsOfType(msg, 2)

            self.assertGreaterEqual(len(to_emails), 2)
            self.assertGreaterEqual(len(cc_emails), 1)

            self.assertIn('alice@example.com', to_emails)
            self.assertIn('carol@example.com', to_emails)

    def testMultiToToAsEmailMessage(self):
        """
        Tests EML conversion when To headers appear with mixed casing across recipients.
        """
        with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg:
            em = msg.asEmailMessage()

            self.assertIsInstance(em, EmailMessage)
            self.assertEqual(self._headerCount(em, 'To'), 1)
            # Header lookup on a Message is case-insensitive, so one lookup
            # covers every spelling.
            to_header = em['To']
            self.assertIn('alice@example.com', to_header)
            self.assertIn('carol@example.com', to_header)

    def testUnicodeHeaderAsEmailMessage(self):
        """
        Tests that a message whose To header mixes plain and RFC 2047-encoded
        display names (using a charset with invalid byte sequences) can be
        converted to EML bytes without a UnicodeEncodeError.
        """
        with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg:
            em = msg.asEmailMessage()
            self.assertIsInstance(em, EmailMessage)
            raw = em.as_bytes()
            self.assertIn(b'alice@example.com', raw)

    def testGbkFallbackDisplayName(self):
        """
        Tests that RFC 2047 encoded words declared as GB2312 but containing
        byte sequences only valid in GBK (a strict superset) are decoded
        correctly rather than mangled via latin-1 fallback.

        The fixture is expected to contain the encoded word
        =?gb2312?B?6pCzydXCKG1heGNoZW4p?= whose payload is GBK text.
        """
        with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg:
            em = msg.asEmailMessage()
            raw = em.as_bytes()
            # The correctly GBK-decoded name must appear RFC 2047-encoded as UTF-8.
            # NOTE(review): this substring match only holds when the name
            # starts on a 3-byte boundary of the base64 payload - confirm the
            # serialised form if this assertion becomes flaky.
            self.assertIn(base64.b64encode('陳成章'.encode('utf-8')), raw)

    def testMultiToAsEmailMessage(self):
        """
        Tests that a message with multiple To recipients converts to EML without error,
        and that duplicate To headers are merged into a single comma-separated value.
        """
        with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg:
            em = msg.asEmailMessage()

            self.assertIsInstance(em, EmailMessage)
            # EmailMessage must have exactly one To field (duplicates merged),
            # whatever casing the stored header used.
            self.assertEqual(self._headerCount(em, 'To'), 1)
            to_header = em['To']
            self.assertIn('alice@example.com', to_header)
            self.assertIn('carol@example.com', to_header)
            # NOTE(review): trailing spaces in the two literals below suggest
            # angle-bracket addresses may have been lost - confirm.
            self.assertEqual(em['CC'], 'Dave Jones ')
            self.assertEqual(em['From'], 'Bob Sender ')