TeamMsgExtractor · glorat · Oct 8, 2024 · Oct 11, 2024 · Oct 11, 2024 · Oct 11, 2024
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,9 @@ __pycache__/
 !/example-msg-files/expected-outputs/2016-02-23_0657 MSG Test File/*
 !/example-msg-files/strangeDate.msg
 !/example-msg-files/unicode.msg
+!/example-msg-files/multi-to.msg
+!/example-msg-files/multi-to-to.msg
+!/example-msg-files/unicode-header.msg
 
 # Reserved Folders
 /output

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,5 @@
 **v0.55.0**
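+* [[TeamMsgExtractor #???](https://github.com/TeamMsgExtractor/msg-extractor/issues/???)] Fixed `MessageBase.asEmailMessage()` raising `ValueError` when a message has multiple `To` recipients, and `UnicodeEncodeError` when a header contains a malformed RFC 2047 encoded word (e.g. GBK bytes labelled as GB2312).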
 * [[TeamMsgExtractor #465](https://github.com/TeamMsgExtractor/msg-extractor/issues/465)] Added missing `msg.close()` to `openMsg()`. If the MSG file was actually just a plain OLE file, it would be left open.
 * Adjusted the default value of `maxNameLength` for `MessageBase.save()` to 40 instead of 256.
 * Adjusted exception handling for `MessageBase.save()` to properly report the reason a folder fails to be created.

diff --git a/example-msg-files/multi-to-to.msg b/example-msg-files/multi-to-to.msg
diff --git a/example-msg-files/multi-to.msg b/example-msg-files/multi-to.msg
diff --git a/example-msg-files/unicode-header.msg b/example-msg-files/unicode-header.msg
diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py
@@ -25,6 +25,7 @@
 
 from email import policy
 from email.charset import Charset, QP
+from email.header import decode_header as _decode_header, Header as _Header
 from email.message import EmailMessage
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
@@ -56,6 +57,77 @@
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 
+# Matches a single RFC 2047 encoded word: =?charset?B-or-Q?payload?=
+_RFC2047_WORD = re.compile(r'=\?[^?]+\?[bBqQ]\?[^?]*\?=')
+
+# Encodings to try, in order, when a declared charset fails to decode. Keyed by
+# the lower-cased charset name as it appears in the encoded word.
+# GBK is a superset of GB2312 that also accepts second bytes below 0xA1, which
+# GB2312 rejects, so real-world GB2312-labelled headers often decode correctly
+# as GBK. GB18030 in turn extends GBK and is tried last. ('cp936' is not listed
+# because Python treats it as an alias of 'gbk', so it could never succeed where
+# 'gbk' had already failed.)
+_CHARSET_FALLBACKS: Dict[str, Tuple[str, ...]] = {
+    'gb2312': ('gbk', 'gb18030'),
+    'gb_2312': ('gbk', 'gb18030'),
+    'gb_2312-80': ('gbk', 'gb18030'),
+}
+
+
+def _fix_encoded_word(m: re.Match) -> str:
+    """
+    Regex substitution callback: decode one RFC 2047 encoded word and
+    re-emit it as a single, valid UTF-8 encoded word.
+
+    The payload is first decoded with the declared charset. If that fails
+    (e.g. GB2312-labelled GBK bytes), the fallbacks listed for that charset
+    in _CHARSET_FALLBACKS are tried in order; as a last resort the bytes are
+    decoded as latin-1, which cannot fail.
+
+    The result is deliberately kept in encoded-word form instead of being
+    expanded to raw Unicode: decoded text may contain characters that are
+    significant in structured headers (commas, parentheses, angle brackets,
+    line breaks) and would change how the surrounding header is parsed.
+
+    :param m: A match object whose whole match is one encoded word.
+
+    :returns: The re-encoded word, or the matched text unchanged if it could
+        not be parsed or re-encoded.
+    """
+    word = m.group(0)
+    try:
+        parts = _decode_header(word)
+        if len(parts) != 1 or not isinstance(parts[0][0], bytes):
+            return word
+        btext, cs = parts[0]
+        try:
+            text = btext.decode(cs or 'ascii')
+        except (UnicodeDecodeError, LookupError):
+            for fallback in _CHARSET_FALLBACKS.get((cs or '').lower(), ()):
+                try:
+                    text = btext.decode(fallback)
+                    break
+                except (UnicodeDecodeError, LookupError):
+                    continue
+            else:
+                text = btext.decode('latin-1', 'replace')
+        # Header.encode() folds at line breaks; collapse them so the result is
+        # always a single-line word that is safe inside an unfolded value.
+        text = ' '.join(text.splitlines())
+        # str(Header) would give back the decoded text; encode() produces the
+        # actual encoded word. maxlinelen=0 disables wrapping.
+        return _Header(text, charset='utf-8').encode(maxlinelen=0)
+    except Exception:
+        # Best effort: leave anything that cannot be handled exactly as found.
+        return word
+
+
+def _preprocess_encoded_words(text: str) -> str:
+    """
+    Pass every RFC 2047 encoded word found in *text* through
+    _fix_encoded_word and return the resulting string.
+
+    Text outside encoded words is left untouched, so this can be applied to
+    an entire raw header block before it is handed to the header parser.
+    """
+    cleaned = re.sub(_RFC2047_WORD, _fix_encoded_word, text)
+    return cleaned
+
+
+def _sanitize_header(value: str) -> str:
+    """
+    Prepare a header value from a compat32-parsed Message for safe assignment
+    to an EmailMessage (fallback path when no raw headerText is available).
+
+    RFC 2047 encoded words are normalised with _preprocess_encoded_words.
+    Any other non-ASCII text is kept as Unicode, because an EmailMessage
+    encodes such text itself when it is serialised. Characters that cannot be
+    encoded as UTF-8 at all (lone surrogates) are replaced with ``?`` so that
+    serialising the message later does not raise UnicodeEncodeError.
+
+    :param value: The unfolded header value.
+
+    :returns: The cleaned header value.
+    """
+    if '=?' in value:
+        value = _preprocess_encoded_words(value)
+
+    if not value.isascii():
+        # Round-trip through UTF-8 so that only encodable characters remain.
+        value = value.encode('utf-8', 'replace').decode('utf-8')
+
+    return value
+
 
 class MessageBase(MSGFile):
     """
@@ -165,10 +237,48 @@ def asEmailMessage(self) -> EmailMessage:
         """
         ret = EmailMessage()
 
-        # Copy the headers.
-        for key, value in self.header.items():
-            if key.lower() != 'content-type':
-                ret[key] = value.replace('\r\n', '').replace('\n', '')
+        # Prefer the raw transport-header block: pre-fix any malformed RFC 2047
+        # encoded words (e.g. GB2312-labelled GBK bytes), then parse with the
+        # modern policy so that RFC 5322 unfolding and RFC 2047 decoding are
+        # handled natively and the values arrive as clean Unicode strings.
+        # Fall back to the compat32-derived self.header when no raw text is
+        # stored (synthesised headers from MAPI properties).
+        _ADDRESS_HEADERS = frozenset({'to', 'cc', 'bcc', 'reply-to'})
+        if self.headerText:
+            raw = self.headerText
+            if raw.startswith('Microsoft Mail Internet Headers Version 2.0'):
+                raw = raw[43:].lstrip()
+            raw = _preprocess_encoded_words(raw)
+            source_items = list(HeaderParser(policy = policy.default).parsestr(raw).items())
+            sanitize = False
+        else:
+            source_items = list(self.header.items())
+            sanitize = True
+
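+        # Address headers (To/CC/BCC/Reply-To) may appear once per recipient in
+        # the stored header block; merge them into a single comma-separated value
+        # and assign each merged header once, after the loop.
+        # All other headers — including multi-valued trace headers like Received
+        # and Authentication-Results — are forwarded as-is; assigning to an
+        # EmailMessage appends rather than replaces, so repeated headers are kept.
+        # NOTE: the default policy still raises ValueError when a header it limits
+        # to one occurrence (e.g. Subject, Date, From, Sender, Message-ID) is
+        # assigned a second time; such duplicates are not merged here.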
+        address_merged: Dict[str, Tuple[str, str]] = {}
+        for key, value in source_items:
+            if key.lower() == 'content-type':
+                continue
+            if sanitize:
+                # compat32 values are folded and may contain raw encoded words.
+                value = re.sub(r'\r?\n[ \t]', ' ', value)
+                value = value.replace('\r\n', '').replace('\n', '')
+                value = _sanitize_header(value)
+            lower = key.lower()
+            if lower in _ADDRESS_HEADERS:
+                if lower in address_merged:
+                    address_merged[lower] = (address_merged[lower][0], address_merged[lower][1] + ', ' + value)
+                else:
+                    address_merged[lower] = (key, value)
+            else:
+                ret[key] = value
+        for _, (key, value) in address_merged.items():
+            ret[key] = value
 
         ret['Content-Type'] = 'multipart/mixed'
 

diff --git a/extract_msg_tests/__init__.py b/extract_msg_tests/__init__.py
@@ -1,6 +1,7 @@
 __all__ = [
     'AttachmentTests',
     'CommandLineTests',
+    'MessageTests',
     'OleWriterEditingTests',
     'OleWriterExportTests',
     'PropTests',
@@ -10,6 +11,7 @@
 
 from .attachment_tests import AttachmentTests
 from .cmd_line_tests import CommandLineTests
+from .message_tests import MessageTests
 from .ole_writer_tests import OleWriterEditingTests, OleWriterExportTests
 from .prop_tests import PropTests
 from .util_tests import UtilTests

diff --git a/extract_msg_tests/message_tests.py b/extract_msg_tests/message_tests.py
@@ -0,0 +1,117 @@
+__all__ = [
+    'MessageTests',
+]
+
+
+import base64
+import unittest
+from email.message import EmailMessage
+
+from .constants import TEST_FILE_DIR
+from extract_msg import openMsg
+from extract_msg.msg_classes import Message
+
+
+class MessageTests(unittest.TestCase):
+    """
+    Tests for recipient parsing and for conversion to EmailMessage
+    (``asEmailMessage``), using the example MSG files in TEST_FILE_DIR.
+    """
+
+    def testMultiTo(self):
+        """
+        Tests parsing a message with multiple To and CC recipients.
+        """
+        with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg:
+            self.assertIsInstance(msg, Message)
+
+            self.assertTrue(msg.subject.startswith('Test: multiple To recipients'))
+            self.assertEqual(msg.sender, 'Bob Sender <bob@example.com>')
+            self.assertTrue((msg.body or '').startswith('Test email body.'))
+
+            # Expect at least two To recipients (type 1) and at least one CC (type 2).
+            to_recipients = [r for r in msg.recipients if r.type == 1]
+            cc_recipients = [r for r in msg.recipients if r.type == 2]
+
+            self.assertGreaterEqual(len(to_recipients), 2)
+            self.assertGreaterEqual(len(cc_recipients), 1)
+
+            # Stored addresses may carry trailing NUL characters; strip them
+            # before comparing.
+            to_emails = [r.email.strip('\x00') for r in to_recipients]
+            self.assertIn('alice@example.com', to_emails)
+            self.assertIn('carol@example.com', to_emails)
+
+            cc_emails = [r.email.strip('\x00') for r in cc_recipients]
+            self.assertIn('dave@example.com', cc_emails)
+
+    def testMultiToTo(self):
+        """
+        Tests parsing a message where To headers use mixed casing (e.g. 'To' vs 'TO').
+        """
+        with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg:
+            self.assertIsInstance(msg, Message)
+            self.assertTrue(msg.subject.startswith('Test: multiple To recipients'))
+
+            to_recipients = [r for r in msg.recipients if r.type == 1]
+            cc_recipients = [r for r in msg.recipients if r.type == 2]
+
+            self.assertGreaterEqual(len(to_recipients), 2)
+            self.assertGreaterEqual(len(cc_recipients), 1)
+
+            to_emails = [r.email.strip('\x00') for r in to_recipients]
+            self.assertIn('alice@example.com', to_emails)
+            self.assertIn('carol@example.com', to_emails)
+
+    def testMultiToToAsEmailMessage(self):
+        """
+        Tests EML conversion when To headers appear with mixed casing across recipients.
+        """
+        with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg:
+            em = msg.asEmailMessage()
+
+            self.assertIsInstance(em, EmailMessage)
+            # Exactly one To field must remain, whatever its casing.
+            self.assertEqual(sum(1 for k in em.keys() if k.lower() == 'to'), 1)
+            # Header lookup on an EmailMessage is case-insensitive.
+            to_header = em['To']
+            self.assertIn('alice@example.com', to_header)
+            self.assertIn('carol@example.com', to_header)
+
+    def testUnicodeHeaderAsEmailMessage(self):
+        """
+        Tests that a message whose To header mixes plain and RFC 2047-encoded
+        display names (using a charset with invalid byte sequences) can be
+        converted to EML bytes without a UnicodeEncodeError. This crashes on
+        Python 3.13 without the fix.
+        """
+        with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg:
+            em = msg.asEmailMessage()
+            self.assertIsInstance(em, EmailMessage)
+            raw = em.as_bytes()
+            self.assertIn(b'alice@example.com', raw)
+
+    def testGbkFallbackDisplayName(self):
+        """
+        Tests that RFC 2047 encoded words declared as GB2312 but containing
+        byte sequences only valid in GBK (a superset) are decoded correctly
+        rather than mangled via the latin-1 fallback.
+
+        The encoded word =?gb2312?B?6pCzydXCKG1heGNoZW4p?= decodes to
+        '陳成章(maxchen)' in GBK. Without the fix, the display name is
+        garbled as latin-1.
+        """
+        with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg:
+            em = msg.asEmailMessage()
+            raw = em.as_bytes()
+            # The correctly GBK-decoded name must appear RFC 2047-encoded as
+            # UTF-8; the base64 form of the UTF-8 bytes of 陳成章 is the marker.
+            self.assertIn(base64.b64encode('陳成章'.encode('utf-8')), raw)
+
+    def testMultiToAsEmailMessage(self):
+        """
+        Tests that a message with multiple To recipients converts to EML without error,
+        and that duplicate To headers are merged into a single comma-separated value.
+        """
+        with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg:
+            em = msg.asEmailMessage()
+
+            self.assertIsInstance(em, EmailMessage)
+            # EmailMessage must have exactly one To field (duplicates merged),
+            # regardless of how the stored header block cased its name.
+            self.assertEqual(sum(1 for k in em.keys() if k.lower() == 'to'), 1)
+            to_header = em['To']
+            self.assertIn('alice@example.com', to_header)
+            self.assertIn('carol@example.com', to_header)
+            self.assertEqual(em['CC'], 'Dave Jones <dave@example.com>')
+            self.assertEqual(em['From'], 'Bob Sender <bob@example.com>')