Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
35512ec
Merge pull request #438 from TeamMsgExtractor/next-release
TheElementalOfDestruction Oct 8, 2024
fa902fc
Merge pull request #441 from TeamMsgExtractor/next-release
TheElementalOfDestruction Oct 11, 2024
9e276e0
Merge pull request #442 from TeamMsgExtractor/next-release
TheElementalOfDestruction Oct 11, 2024
1302d6f
Merge pull request #443 from TeamMsgExtractor/next-release
TheElementalOfDestruction Oct 11, 2024
373f6c1
Merge pull request #446 from TeamMsgExtractor/next-release
TheElementalOfDestruction Oct 22, 2024
207bfb9
Merge pull request #451 from TeamMsgExtractor/next-release
TheElementalOfDestruction Feb 5, 2025
4033e9e
Merge pull request #455 from TeamMsgExtractor/next-release
TheElementalOfDestruction Mar 14, 2025
5fe6f3a
Merge pull request #457 from TeamMsgExtractor/next-release
TheElementalOfDestruction Mar 24, 2025
5585364
Merge pull request #458 from TeamMsgExtractor/next-release
TheElementalOfDestruction Mar 24, 2025
68858f0
Merge pull request #463 from TeamMsgExtractor/next-release
TheElementalOfDestruction Apr 10, 2025
f9fae3d
Merge pull request #468 from TeamMsgExtractor/next-release
TheElementalOfDestruction Aug 12, 2025
88b024b
Fix asEmailMessage() failing with multiple To recipients
glorat Apr 18, 2026
aa9581d
Fix asEmailMessage() failing when To headers have mixed casing
glorat Apr 18, 2026
48c4375
Fix asEmailMessage() raising UnicodeEncodeError with RFC 2047 encoded…
glorat Apr 18, 2026
74bfd5b
Fix GB2312-labelled RFC 2047 headers mangled when bytes are valid GBK
glorat Apr 19, 2026
6adacd1
fix: clean up eml header handling
glorat Apr 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ __pycache__/
!/example-msg-files/expected-outputs/2016-02-23_0657 MSG Test File/*
!/example-msg-files/strangeDate.msg
!/example-msg-files/unicode.msg
!/example-msg-files/multi-to.msg
!/example-msg-files/multi-to-to.msg
!/example-msg-files/unicode-header.msg

# Reserved Folders
/output
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
**v0.55.0**
* [[TeamMsgExtractor #???](https://github.com/TeamMsgExtractor/msg-extractor/issues/???)] Fixed `MessageBase.asEmailMessage()` raising `ValueError` when a message has multiple `To` recipients.
* [[TeamMsgExtractor #465](https://github.com/TeamMsgExtractor/msg-extractor/issues/465)] Added missing `msg.close()` to `openMsg()`. If the MSG file was actually just a plain OLE file, it would be left open.
* Adjusted the default value of `maxNameLength` for `MessageBase.save()` to 40 instead of 256.
* Adjusted exception handling for `MessageBase.save()` to properly report the reason a folder fails to be created.
Expand Down
Binary file added example-msg-files/multi-to-to.msg
Binary file not shown.
Binary file added example-msg-files/multi-to.msg
Binary file not shown.
Binary file added example-msg-files/unicode-header.msg
Binary file not shown.
118 changes: 114 additions & 4 deletions extract_msg/msg_classes/message_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from email import policy
from email.charset import Charset, QP
from email.header import decode_header as _decode_header, Header as _Header
from email.message import EmailMessage
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
Expand Down Expand Up @@ -56,6 +57,77 @@
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Matches a single RFC 2047 encoded word, i.e. "=?charset?encoding?payload?=",
# where the encoding flag is B or Q in either case.
_RFC2047_WORD = re.compile(r'=\?[^?]+\?[bBqQ]\?[^?]*\?=')

# Codecs to try, in order, when a payload labelled as GB2312 does not decode
# with the declared charset. GBK is a superset of GB2312 that additionally
# accepts trail bytes below 0xA1, so real-world GB2312-labelled headers often
# decode correctly as GBK. GB18030 extends GBK further and is tried last.
# NOTE: "cp936" is deliberately not listed; Python resolves it to the same
# codec as "gbk", so trying it after "gbk" could never succeed.
_GB2312_FALLBACKS: Tuple[str, ...] = ('gbk', 'gb18030')

# Encodings to try when a declared charset fails, keyed by the lower-cased
# charset name as it appears in the encoded word.
_CHARSET_FALLBACKS: Dict[str, Tuple[str, ...]] = {
    'gb2312': _GB2312_FALLBACKS,
    'gb_2312': _GB2312_FALLBACKS,
    'gb_2312-80': _GB2312_FALLBACKS,
}


def _fix_encoded_word(m: re.Match) -> str:
    """
    Regex substitution callback: decode one RFC 2047 encoded word and
    re-emit it as a valid UTF-8 encoded word.

    The payload is decoded with the declared charset first. If that fails
    (e.g. GB2312-labelled GBK bytes), the codecs listed for that charset in
    _CHARSET_FALLBACKS are tried in order, and finally latin-1, which accepts
    every byte value.

    :param m: The match object for a single encoded word.

    :returns: A single-line, pure-ASCII encoded word carrying the text as
        UTF-8, or the matched text unchanged if it could not be processed.
    """
    word = m.group(0)
    try:
        parts = _decode_header(word)
        if len(parts) != 1 or not isinstance(parts[0][0], bytes):
            return word
        payload, declared = parts[0]
        # RFC 2231 permits "charset*language"; only the part before the
        # asterisk names a codec.
        charset = (declared or 'ascii').partition('*')[0].lower()
        try:
            text = payload.decode(charset)
        except (UnicodeDecodeError, LookupError):
            for fallback in _CHARSET_FALLBACKS.get(charset, ()):
                try:
                    text = payload.decode(fallback)
                    break
                except (UnicodeDecodeError, LookupError):
                    continue
            else:
                # latin-1 maps every byte to a character, so this cannot fail.
                text = payload.decode('latin-1')
        # Header.encode() yields the RFC 2047 form; str(Header) would yield
        # the decoded text instead, which could splice commas, angle brackets
        # or line breaks from the payload into the surrounding header.
        # maxlinelen=0 disables folding so the result stays on one line.
        encoded = _Header(text, charset='utf-8').encode(maxlinelen=0)
        return encoded or word
    except Exception:
        # Deliberately best-effort: a word that cannot be rewritten is left
        # untouched so the substitution never aborts.
        return word


def _preprocess_encoded_words(text: str) -> str:
    """
    Rewrites the RFC 2047 encoded words found in a block of header text.

    Every substring of :param text: matched by _RFC2047_WORD is passed to
    _fix_encoded_word and replaced by what that callback returns; all other
    characters are left as they are. Intended to be run over a whole raw
    header block before it is handed to the header parser.

    :returns: The rewritten text.
    """
    rewritten = re.sub(_RFC2047_WORD, _fix_encoded_word, text)
    return rewritten


def _sanitize_header(value: str) -> str:
    """
    Prepare a header value from a compat32-parsed Message for assignment to
    an EmailMessage (fallback path when no raw headerText is available).

    RFC 2047 encoded words in :param value: are rewritten through
    _fix_encoded_word. Plain non-ASCII text is returned unchanged: it needs
    no pre-encoding, because an EmailMessage using the modern policy
    RFC 2047-encodes non-ASCII header text itself when it is serialised, and
    pre-encoding a whole address header would hide its addresses from the
    parser.

    :returns: The value with its encoded words rewritten, or the value
        itself when it contains none.
    """
    # Cheap guard: skip the regex pass when no encoded word can be present.
    if '=?' not in value:
        return value

    return _RFC2047_WORD.sub(_fix_encoded_word, value)


class MessageBase(MSGFile):
"""
Expand Down Expand Up @@ -165,10 +237,48 @@ def asEmailMessage(self) -> EmailMessage:
"""
ret = EmailMessage()

# Copy the headers.
for key, value in self.header.items():
if key.lower() != 'content-type':
ret[key] = value.replace('\r\n', '').replace('\n', '')
# Prefer the raw transport-header block: pre-fix any malformed RFC 2047
# encoded words (e.g. GB2312-labelled GBK bytes), then parse with the
# modern policy so that RFC 5322 unfolding and RFC 2047 decoding are
# handled natively and the values arrive as clean Unicode strings.
# Fall back to the compat32-derived self.header when no raw text is
# stored (synthesised headers from MAPI properties).
_ADDRESS_HEADERS = frozenset({'to', 'cc', 'bcc', 'reply-to'})
if self.headerText:
raw = self.headerText
if raw.startswith('Microsoft Mail Internet Headers Version 2.0'):
raw = raw[43:].lstrip()
raw = _preprocess_encoded_words(raw)
source_items = list(HeaderParser(policy = policy.default).parsestr(raw).items())
sanitize = False
else:
source_items = list(self.header.items())
sanitize = True

# Address headers (To/CC/BCC/Reply-To) may appear once per recipient in
# the stored header block; merge them into a single comma-separated value.
# All other headers — including multi-valued trace headers like Received
# and Authentication-Results — are forwarded as-is; EmailMessage appends
# rather than replaces, so natural repetition is preserved correctly.
address_merged: Dict[str, Tuple[str, str]] = {}
for key, value in source_items:
if key.lower() == 'content-type':
continue
if sanitize:
# compat32 values are folded and may contain raw encoded words.
value = re.sub(r'\r?\n[ \t]', ' ', value)
value = value.replace('\r\n', '').replace('\n', '')
value = _sanitize_header(value)
lower = key.lower()
if lower in _ADDRESS_HEADERS:
if lower in address_merged:
address_merged[lower] = (address_merged[lower][0], address_merged[lower][1] + ', ' + value)
else:
address_merged[lower] = (key, value)
else:
ret[key] = value
for _, (key, value) in address_merged.items():
ret[key] = value

ret['Content-Type'] = 'multipart/mixed'

Expand Down
2 changes: 2 additions & 0 deletions extract_msg_tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__all__ = [
'AttachmentTests',
'CommandLineTests',
'MessageTests',
'OleWriterEditingTests',
'OleWriterExportTests',
'PropTests',
Expand All @@ -10,6 +11,7 @@

from .attachment_tests import AttachmentTests
from .cmd_line_tests import CommandLineTests
from .message_tests import MessageTests
from .ole_writer_tests import OleWriterEditingTests, OleWriterExportTests
from .prop_tests import PropTests
from .util_tests import UtilTests
Expand Down
117 changes: 117 additions & 0 deletions extract_msg_tests/message_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
__all__ = [
'MessageTests',
]


import base64
import unittest
from email.message import EmailMessage

from .constants import TEST_FILE_DIR
from extract_msg import openMsg
from extract_msg.msg_classes import Message


class MessageTests(unittest.TestCase):
    """
    Tests for recipient parsing and for conversion of MSG files to
    EmailMessage instances.
    """

    @staticmethod
    def _headerCount(em, name):
        """
        Returns how many header fields of :param em: are called
        :param name:, ignoring case.
        """
        wanted = name.lower()
        return sum(1 for key in em.keys() if key.lower() == wanted)

    def testMultiTo(self):
        """
        Tests parsing a message with multiple To and CC recipients.
        """
        with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg:
            self.assertIsInstance(msg, Message)

            self.assertTrue(msg.subject.startswith('Test: multiple To recipients'))
            self.assertEqual(msg.sender, 'Bob Sender <bob@example.com>')
            self.assertTrue((msg.body or '').startswith('Test email body.'))

            # Expect at least two To recipients (type 1) and at least one CC (type 2).
            to_recipients = [r for r in msg.recipients if r.type == 1]
            cc_recipients = [r for r in msg.recipients if r.type == 2]

            self.assertGreaterEqual(len(to_recipients), 2)
            self.assertGreaterEqual(len(cc_recipients), 1)

            to_emails = [r.email.strip('\x00') for r in to_recipients]
            self.assertIn('alice@example.com', to_emails)
            self.assertIn('carol@example.com', to_emails)

            cc_emails = [r.email.strip('\x00') for r in cc_recipients]
            self.assertIn('dave@example.com', cc_emails)

    def testMultiToTo(self):
        """
        Tests parsing a message where To headers use mixed casing (e.g. 'To' vs 'TO').
        """
        with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg:
            self.assertIsInstance(msg, Message)
            self.assertTrue(msg.subject.startswith('Test: multiple To recipients'))

            to_recipients = [r for r in msg.recipients if r.type == 1]
            cc_recipients = [r for r in msg.recipients if r.type == 2]

            self.assertGreaterEqual(len(to_recipients), 2)
            self.assertGreaterEqual(len(cc_recipients), 1)

            to_emails = [r.email.strip('\x00') for r in to_recipients]
            self.assertIn('alice@example.com', to_emails)
            self.assertIn('carol@example.com', to_emails)

    def testMultiToToAsEmailMessage(self):
        """
        Tests EML conversion when To headers appear with mixed casing across recipients.
        """
        with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg:
            em = msg.asEmailMessage()

            self.assertIsInstance(em, EmailMessage)
            self.assertEqual(self._headerCount(em, 'To'), 1)
            # Header lookup on a message ignores case, so one lookup covers
            # both the 'To' and the 'TO' spelling.
            to_header = em['To']
            self.assertIn('alice@example.com', to_header)
            self.assertIn('carol@example.com', to_header)

    def testUnicodeHeaderAsEmailMessage(self):
        """
        Tests that a message whose To header mixes plain and RFC 2047-encoded
        display names (using a charset with invalid byte sequences) can be
        converted to EML bytes without a UnicodeEncodeError. This crashes on
        Python 3.13 without the fix.
        """
        with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg:
            em = msg.asEmailMessage()
            self.assertIsInstance(em, EmailMessage)
            raw = em.as_bytes()
            self.assertIn(b'alice@example.com', raw)

    def testGbkFallbackDisplayName(self):
        """
        Tests that RFC 2047 encoded words declared as GB2312 but containing
        byte sequences only valid in GBK (a strict superset) are decoded
        correctly rather than mangled via latin-1 fallback.

        The encoded word =?gb2312?B?6pCzydXCKG1heGNoZW4p?= decodes to
        '陳成章(maxchen)' in GBK. Without the fix, the display name is
        garbled as latin-1.
        """
        with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg:
            em = msg.asEmailMessage()
            raw = em.as_bytes()
            # The correctly GBK-decoded name must appear RFC 2047-encoded as UTF-8.
            # The Chinese characters 陳成章 base64-encoded in UTF-8 is the marker.
            self.assertIn(base64.b64encode('陳成章'.encode('utf-8')), raw)

    def testMultiToAsEmailMessage(self):
        """
        Tests that a message with multiple To recipients converts to EML without error,
        and that duplicate To headers are merged into a single comma-separated value.
        """
        with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg:
            em = msg.asEmailMessage()

            self.assertIsInstance(em, EmailMessage)
            # EmailMessage must have exactly one To field whatever its casing
            # (duplicates merged).
            self.assertEqual(self._headerCount(em, 'To'), 1)
            to_header = em['TO']
            self.assertIn('alice@example.com', to_header)
            self.assertIn('carol@example.com', to_header)
            self.assertEqual(em['CC'], 'Dave Jones <dave@example.com>')
            self.assertEqual(em['From'], 'Bob Sender <bob@example.com>')