From 88b024b57a1e333313ee67a6fe3b3827d44f9778 Mon Sep 17 00:00:00 2001 From: Kevin Tam Date: Sat, 18 Apr 2026 19:56:51 +0800 Subject: [PATCH 1/5] Fix asEmailMessage() failing with multiple To recipients Multiple To recipients produce duplicate TO keys in msg.header. The header-copy loop was assigning each individually, but EmailMessage enforces a single TO field. Duplicate keys are now merged into one comma-separated value before assignment. Adds example multi-to.msg and tests covering both parsing and EML conversion. --- .gitignore | 1 + CHANGELOG.md | 1 + example-msg-files/multi-to.msg | Bin 0 -> 86016 bytes extract_msg/msg_classes/message_base.py | 14 ++++-- extract_msg_tests/__init__.py | 2 + extract_msg_tests/message_tests.py | 55 ++++++++++++++++++++++++ 6 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 example-msg-files/multi-to.msg create mode 100644 extract_msg_tests/message_tests.py diff --git a/.gitignore b/.gitignore index 8e5f646e..7759747a 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ __pycache__/ !/example-msg-files/expected-outputs/2016-02-23_0657 MSG Test File/* !/example-msg-files/strangeDate.msg !/example-msg-files/unicode.msg +!/example-msg-files/multi-to.msg # Reserved Folders /output diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fbebaa2..cf816211 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ **v0.55.0** +* [[TeamMsgExtractor #???](https://github.com/TeamMsgExtractor/msg-extractor/issues/???)] Fixed `MessageBase.asEmailMessage()` raising `ValueError` when a message has multiple `To` recipients. * [[TeamMsgExtractor #465](https://github.com/TeamMsgExtractor/msg-extractor/issues/465)] Added missing `msg.close()` to `openMsg()`. If the MSG file was actually just a plain OLE file, it would be left open. * Adjusted the default value of `maxNameLength` for `MessageBase.save()` to 40 instead of 256. * Adjusted exception handling for `MessageBase.save()` to properly report the reason a folder fails to be created. diff --git a/example-msg-files/multi-to.msg b/example-msg-files/multi-to.msg new file mode 100644 index 0000000000000000000000000000000000000000..8a6ace985a0246fa582274457e7cbeeff8ed874f GIT binary patch literal 86016 zcmeHQdypeXdDrgxb!Q(wgE4-@>>!LeY_xjsO6%Rdec!EpdwbmKjCti4&5Ty^NgCbE z=x*1>aA0gOk0gYIgb*ME$R9~6MJiP(@~DKO@&Hm5s0sm00!98K|D-CZRHdi_oZr_o zEseAqO-s9bXI$M^-;R2w`}?|o{e545-96HM>zlXz=yQL1_kTpX(p+RL^79uCMQ-rj zAH`=PmX1aucjE))fBwP?FSs`&kr8}AR;FJN1b&8g5{+Tpt^-{UqA|Y_bQ9esXccqDppSs4 zy}yF%uY!IJ^fAz{gFX)W4bX3bJ^}hI&~Jl23Hlw-r$E08`ZVY>pwEJS5A-?E?}I)M z`U2>Spg#bS9GMOtfwRyg6Plwy^Xs@;Ok*$rg6WRP^&`BMcL>(`C6v1obqik)%V|QZ zYY~;nUPCKNXpzP4IM9ES>3b0CW0BF3Q9K+2Q9FXlb>Ao3q_hspF=6>C`mUla4wFq* zD38uEFvv@Brswy#{)3dp%D^CYYb& z<-4!PeDcW;iKK4%4ahU$O-L>wZa}(dI}*U3hVllGDE>5F3Jn+%_1}U-4RRX+QqN;{ zNOb*kYm;wp<4?X>Bz5B0u6XytE_;`sYKegSR=h=Xn@HNbe41_D zME$=J9}j_O?iERUmrpajoBaK!*gzz8%eUcyiqLrnv#x7z=b5%85gm};C0Rt!oAU$E z{}DX81thBfx1#LJJ$9u3LGsgxVnpPVT%J6(eJ@Axlwu4@{pByR-X~au<(BXMOZ~qY zH;;g*4c*Us<*Dy{uIMj+`9S*r3dld~HLRcf3`U+tKcxSR7xhWB|F@$o#gdfz%TFCh z{|S5YSCsn6pF~70qJN~%w}C|a|0 zm{J=;g`>b-AX)=Z>WBZ7nEtim4^jE|K(0cy1$rtngF9W|<572)q8*YCjpYY%R{*Oa`xc?>jD?;{1_Dni0I{pgE zhTA{MUlo#1a(M#l(DWLV2)BRgyO{qae?mG%Z3q?a$)*^XQn3D${PcnJ|Fw{NmPrqj zFXsQrpWF))?f>ggmi7`T_3OW@|Ka|ZkG-h;V~`u>f640jzmWKk?Sozio}%(kqHHyR zKS)9ScjX_9{}YLSNq&;)U%3Ave?m4T+J6;gzxXKrAoc5i`he#D2O#$nlO86Y@DJ5L zHkaN9exm(c1=BZ9$0tVdGCe zO;kRu(|=0Bk%HvsJk1l(KPGn&{PU3eV}@feDZs*zgH1`Ts!}p zfOPsNp#SWAgnWwV_%EQW6ViVkc?gXE!2V|mvPoYl1?itFUo8H55Paew(f%)^Y*_s1 z%I7GoqDJpT`yX8QMCH>yA~E{~Y-7Hc+3!63u-tCHtrOl;#xC@u#(~4x$u1{;uU8g#Dj}+;IIP$zt|T^Z67= zwEr6@doAt%4CIUXKQa5C1V1{F5bgh)P?px~l!C|q;Ow8)p8IM4^WZm#{cocDOUeGH z(VrO)16)EAol}0pD-}Hc2WS6pgY1Hkd@=if2)vGiMD_n2D0?mKpY|Ta`Y&exbKplY zwP^p}iLw+gQ3@XagR_5H|L>>$(|kUN{TEUGrDXq$=nuuTqT^3zf7depFGIfA{3mAr zw4R`tQndf{PSv%v|HmQ!0T88N`y>A=X8#n=)0$4S|MV_dc>Hy6_Fskk{j`6Irv|Zq zdWY_%Wd9GNKNQo8j{ilJy_WI64f)~vCm#Pl0)7cJk$Ie+JE-$^EvPm?f;V~dlLyq>Mvh>|CNrDMCHE=WkuvC@m9D< z|4A-S)TglY6bf+p|B5%p&)_uZ@9`$;iO5$Ye~0`(U^f4E3v^$XD?0 zI_+F{rj+;o<72o@>#AOfzjw;7{`0@*Bj3T?Q9z{eZTvfpcASg+GfszofVbA)1liBS zv#vq@$;h{Y=LnLb8?bl(`VQ>HStzbZH5aIJ~6#6%>;F%up zzKZbRW$u6{Be^{B`~9x(!^5w81@^1|ZwEhWLqECv*+=o=B5j%>U5oxd3%R`itPy+s z!?73b{~pEds{V^Ce+0cDdmO{{I?(l?8$k38XZ%XB9{+r=?=!#0ulIS+?#GqRQ|bO2yepmYP(Fq$op;dw1?7|8mGhu0?T6BS zaL#)+jVtXD(Hwu8|MPoL=L+b(Ufrj0eLv_K zFaH5tKL~ox%R{V8FF^!;ju``UC`P$6kze@HGoSsRpKO$m{?W&7iahxGKYO0e*^hy! zG(GoX*ei;6?@QOY-vc~^>!6AJ4)NtzUVjYVBqV=C@+ggCivJY`Q4Gxk%w6Rj#l|umgnog(UAp~r}Zc<@?75N$h|DzZttsEzAb;8<=gT{ zUq<{0J4)k`F_xz@7E0rh>sbCHZZxq;=ceCp31A$-OnH?lnWaY|#6n^@lXFnNCR z`}eSX8~=M*p7;0W$T-Wh!uT^5`QKap?HP;wAItOS<9PLkp7ZzLMn?{@Jbykq@(Pyc z?}v>?vMhheD=;3(v;5au`Bj!b>E-7mHztUG;_ZL^?mxNn`H}g^O&xiBTY=?|dgX79 z+`;m^e`Ap&8LIEjhBThHvpg?97Wv_Edj6k3V};P)|7Llv^W%}DkJEFSBPoqXZe#hM zx>=e>uh4TYZ#*)>@;~;Tk3~j1o_n_gFKdY4)-fZV#*9Fb^q{GLI`qw#>4<_Tyov9a z(l@{wxEi=q09VEL{Ve)!7=25Sc5wD5ZLSy2OfRi&%rBgM_{7Aeq*f8lRFfAp?T*CRg|xwQaO8o9EBw;yOvX%pxc*f)JO_PxY;d{@uHH*&Yw zx}gcACh*-gbLbV!BI*|R(D-)yhGSA>^5-$G+w5D2?#PSx>00a0$jEa~&p!J=Y5VB2 z|MJAspKOjke)#L3cw+R}AK&|#k?;M*cevZ=3(ha}`0vqvuC@O39T!c*+Vq0JUHfUZ4W+SY|EIn5QuDJjlAJoxST1+>D_sTx zC%*Jo6Zd@lWACEn{XpsTT4CWQfBWT;$R9?go~MQJfXUtGr@yq5FnEdvSa1}!Jp&KG zy+g2ezY!&)*lDJ?g1m)--wvtZO211`#}ncc?n~zI>nOyhm)~=F=$@wxZpq-ir}qdb za&WL8Ly?1ume)P+S7tr3@!WwvMG;#%h@fKFm5B}`=tW)!`)6z&!I!>ckyo;H+Njs= znC{$0oZU;`#aiu_YRey_R=aGNRkcg@{>0WlE01B(Lo1+@8>g1nPM;WX z4b+?SV0XFx;>r@&-w`iAtp4g=eXhS;AA|Il>o?b5eolS0^>+cfNFI#*@q)M3vyk^| ztlBCw!F-(wZKpEx*fS~y=&J`u>$2DQZ?F0rK0~#d+Vca`--V@<3#TR)uzWweFu%Nb z4b+$X8}onMpY50ae#7fE*I(M>rxc{W{5jWOI;Ws?we{DHCYsPj(qqy{gZW~cHL-DeeQoLN#^%Oq;b5(r+b^GQxczc_ z9(Mkb@so_7L~UU&<+%QG9$bI9J`bzEGJfKS_P)yasW&d7cW+YMRo_?K3aihKxgMA2 z!}#3(jQhR_l{>!{ts zCV&5j;W2Z(W}7k1tUE^CDeN?i<8w2srd2B#wMJES3|rZ*RqOU_wCp&I>3H1M%0^AK zCu$YVGHtWuOlW2;ZkJWdXqc6{6Tg&>d)ypwiWMc9*>)5hiAkLkfoyrbb_JEt?p(Ld#qVc8Y4J{wI=B%+E@*G#=q z-E_Oz}#s_mGTvE~>x1ve_Vt}V|-pG-_G zrzcY@Q?bNyW;vEw%x7b{snu*OpGmK*WL8qC)#U0^Q5V{2eK)#h!>FTO1fJ^H6Q;El zH%q08W^nl8sYGHjt~DXP_E@bNjf?emw}u={|MQuZTqeDojV+{>lCexapN!=atBKfB zZZVTvT*)u2t}MJb`o9|ydFN^(P{1iSouBufM@9FrkKp~7CHx~!y>{e?PuNb`9NsNF zjI(yPJ>lyjDt#8S1f9LPJB&J$Ut+7*8d}m`+4er&R%?ahtMDS;AE0)-Z(S7dd6+jah}$~e&0fbF5HY-S=>OGVYNh*2 zrc{ddUH$WG!JA*Hl<59_d-t9=63HYHkO)WwBmxoviGV~vA|Mfv2uK7Z0ulj2UC@*LboPn z;tl-2PVUn+32#VHEXEGO8K6yEzd?1 z+leIpC03>u)2pj1s|$d5poy@Ox*L}3r+I07$Wy992O2c8D zS+We(F%(s)87J>M)EN7FusP6~kuC7aYf`6q}A=N9T?lp4qnbX}491qgq>rLxW;BR1F;-ywPo0 z3@pnu(Rasc8Z29?7**XS=t^p0s%R8TsidA%Q>j$0ls1YPLz|kM%p`K!WK^l!DOY9c zcG~^mf{FIF8g}{e<%!Gb3Depl?2_@v)=#;5#{t{6#mhEYMF$upe}CQfq2065w4O`n zi^+T}r)9ITOj=9DiivzOmQ7O^OOshu%kws(e=w?4rf5uNCS$2&YBH9|7fZ2xCY_08 zQ>kR$n9NV5OKNMlDz%1bIZC}fP(t$Je7JYl7ZTs9Z&VGkSJr^ps1E`z06N{M8r$Rt z_$}MG*o4z)K0?(>TC~QYPHJQayFJxnvIgmtQ&eo-88IAFsOCUl%RW+ z`i7=8EvQ2xx~m^OE3c}yvnK|dhH7c$GcDsAsudf>#|_O{HXQslEzhgjf~?Zhn`PB9 z^fyZ7pb)5OO|>*PvC?sx2EOwOPf)aIvXY zE0`2??h6mhx$Cv|Oc3tgce|sw178L!aD9A36Pqh4^^)08oifc%Q}HvZW+Z_wfyMYaD`MBYj_iuQMIrqLa6U{~gT(3G^sq4n}Y&3^Sy;7~J#VQK7VgkKl zH>&E68wv)pC|7jdK=9eBRzaj`)zxZ9J%o4qM!mY@)#Q-dfTe2q=O+vQtA$n3$a0mU zYNLfvnd#G}W4QB$YeXH@y6ue2G8mDr`J{yk;($R=*~Ml6WOmhk@u1UVvTPxg#Cl^8 zhjh;oippXXmc=M8i}?eyr0BmO zQnP8SYSO7h7XA!vg>$ZL8V=P-LN>lY;$-v2)q9sHF@B z?(e%IB=Xe$fXMH;*(qT8cgm+i+@$gnu z{#no;#a{qK}Ziy zJ(MaaKU5ytTkX*v84G+`OY@*0 z!CmlD`86!@(PaNNk3ULi){ABn%U`|4vPTR1OIdC&kfRE1L(E30TeMuQZ*_?dSeY+q z8d`y9j5DCMEo(i6Q5|jXT4?oWNDFArvVWU~ii~fLQrtbltY#O|?4EC1YvUr zuXb<8BVgbMYiKTap>U#WZ~BjF|5rDlFnq`59=( zc677u+mJ3+P3=N!r~0+nZ})b8cWwO+-^!s*G_frAU?{? ztICknwYcXE?Rat`8E==R#d~|Z++Tq`X_Yjw{oHCy?_G6ni*d8qo&p2+r2V40+UBx9 zWoqnjr6pBw7ONG!i(nb7PhKP9=(Qcch#q3&JsnSE;)zsT*Ct?|?bAP@R`3q*hj_-T zvkjfw#ncV|d7(SmFBQ&rJ!nnJKJVjg9qca510L_d$Up0`tqxtA)-ddG;Dzi?x9OxLQGnf$KFkQjUzC*`A1Gc~o!a81)=p(*Y=+v6Fu(re0)3JZz#fu7y zae{h3?Npp9?Y4EqP#xVS#@+#lyUo=hrNHV3vbUS;{I{=wz{-02MVG-CxWHa;x6=CW zf$c^$usKz=zSV^59;Pj@dSlD9cKX&8Xp7r3?-ic?VP+YZDzuImkXy?v>vhv%K8@eu zXL|gm5|9ezitTP3DbTM9L1ZgV*-+}uTG6l+v!vMSCEPPDg*VckPzoraHX2o`K+7L6 z!A4WV#9{~sa|-5KrPa|SQEz#P42zA2I;>D!;*wp%B*eBZxgx!vK{*?vDS-zA#Y z_N}yst?MqT``3p4bR1w%zN@En%LAsXtu9O*{d6M@V(RWH^y{iy-Ph5fE4w`%y!ztu z(t>T9S_LjoUqS5P>vbC!1f}@)aU-@>c zYZ!+f*bLd0Zm0y=bo!~b=6OBtGGOXne^=!q+hNH4-9}q?yL6K)%vqykSoE4^*R0sj zA9QN@nvRzbhHZt8t>U5UVt;Pvx)hoFL-;}Zi}wTi*X{NZLRQqND?LVfaJT6)=`qwp zv!byczA^@T_{tdV;VWafhp&wB9=vjay-hHzp;K+~gNCp EmailMessage: """ ret = EmailMessage() - # Copy the headers. + # Merge duplicate keys (e.g. multiple TO entries) into one comma-separated value. + seen = {} for key, value in self.header.items(): - if key.lower() != 'content-type': - ret[key] = value.replace('\r\n', '').replace('\n', '') + if key.lower() == 'content-type': + continue + cleaned = value.replace('\r\n', '').replace('\n', '') + if key in seen: + seen[key] += ', ' + cleaned + else: + seen[key] = cleaned + for key, value in seen.items(): + ret[key] = value ret['Content-Type'] = 'multipart/mixed' diff --git a/extract_msg_tests/__init__.py b/extract_msg_tests/__init__.py index 3f2ba1ee..878fcdad 100644 --- a/extract_msg_tests/__init__.py +++ b/extract_msg_tests/__init__.py @@ -1,6 +1,7 @@ __all__ = [ 'AttachmentTests', 'CommandLineTests', + 'MessageTests', 'OleWriterEditingTests', 'OleWriterExportTests', 'PropTests', @@ -10,6 +11,7 @@ from .attachment_tests import AttachmentTests from .cmd_line_tests import CommandLineTests +from .message_tests import MessageTests from .ole_writer_tests import OleWriterEditingTests, OleWriterExportTests from .prop_tests import PropTests from .util_tests import UtilTests diff --git a/extract_msg_tests/message_tests.py b/extract_msg_tests/message_tests.py new file mode 100644 index 00000000..9d071269 --- /dev/null +++ b/extract_msg_tests/message_tests.py @@ -0,0 +1,55 @@ +__all__ = [ + 'MessageTests', +] + + +import unittest +from email.message import EmailMessage + +from .constants import TEST_FILE_DIR +from extract_msg import openMsg +from extract_msg.msg_classes import Message + + +class MessageTests(unittest.TestCase): + def testMultiTo(self): + """ + Tests parsing a message with multiple To and CC recipients. + """ + with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg: + self.assertIsInstance(msg, Message) + + self.assertTrue(msg.subject.startswith('Test: multiple To recipients')) + self.assertEqual(msg.sender, 'Bob Sender ') + self.assertTrue((msg.body or '').startswith('Test email body.')) + + # Expect at least two To recipients (type 1) and at least one CC (type 2). + to_recipients = [r for r in msg.recipients if r.type == 1] + cc_recipients = [r for r in msg.recipients if r.type == 2] + + self.assertGreaterEqual(len(to_recipients), 2) + self.assertGreaterEqual(len(cc_recipients), 1) + + to_emails = [r.email.strip('\x00') for r in to_recipients] + self.assertIn('alice@example.com', to_emails) + self.assertIn('carol@example.com', to_emails) + + cc_emails = [r.email.strip('\x00') for r in cc_recipients] + self.assertIn('dave@example.com', cc_emails) + + def testMultiToAsEmailMessage(self): + """ + Tests that a message with multiple To recipients converts to EML without error, + and that duplicate To headers are merged into a single comma-separated value. + """ + with openMsg(TEST_FILE_DIR / 'multi-to.msg') as msg: + em = msg.asEmailMessage() + + self.assertIsInstance(em, EmailMessage) + # EmailMessage must have exactly one TO field (duplicates merged). + self.assertEqual(sum(1 for k in em.keys() if k == 'TO'), 1) + to_header = em['TO'] + self.assertIn('alice@example.com', to_header) + self.assertIn('carol@example.com', to_header) + self.assertEqual(em['CC'], 'Dave Jones ') + self.assertEqual(em['From'], 'Bob Sender ') From aa9581d46a70b06e35628c8cab7372eaac0d69c5 Mon Sep 17 00:00:00 2001 From: Kevin Tam Date: Sat, 18 Apr 2026 22:36:42 +0800 Subject: [PATCH 2/5] Fix asEmailMessage() failing when To headers have mixed casing The header dedup dict was keyed case-sensitively, so 'TO' and 'To' were treated as separate keys and both assigned to the EmailMessage, triggering the single-field constraint. Dedup now keys by lowercased name while preserving the original casing of the first occurrence. Adds multi-to-to.msg and tests covering both parsing and EML conversion. --- .gitignore | 1 + example-msg-files/multi-to-to.msg | Bin 0 -> 86016 bytes extract_msg/msg_classes/message_base.py | 10 +++++--- extract_msg_tests/message_tests.py | 31 ++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 example-msg-files/multi-to-to.msg diff --git a/.gitignore b/.gitignore index 7759747a..a027c2e4 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ __pycache__/ !/example-msg-files/strangeDate.msg !/example-msg-files/unicode.msg !/example-msg-files/multi-to.msg +!/example-msg-files/multi-to-to.msg # Reserved Folders /output diff --git a/example-msg-files/multi-to-to.msg b/example-msg-files/multi-to-to.msg new file mode 100644 index 0000000000000000000000000000000000000000..7f89476d81a7665c757deb40284d515e960a2ba2 GIT binary patch literal 86016 zcmeHQdypeXdDrgxb!Q(wgE4-@>>$7#Hd?)RrSt*jRD}R0fg=Bre^QlHs!~(|&hP7) zmPT5QrlsAzvt8X+-;R2wyTAT^{e545-96HM`&+mD`15~y_kTpX(p+RL@{8vWMQ-rj zAH`=PmX1aucjE))fAReD&$~Axkr8}AR;CvT0zXGTiN-K)*MY7F(U{)|x(Rd$bQp9q z=oZis(91x#f}jGC+dwY|y#jPQ=#`*5K(7Mb3AzhJ`f-nUy%*P4gT_G$=ry1y=(V8N zfnE=~5A+7mG0+=9_k$h)y$KWpQG0P*6QCq01xkZ5pe$$-GzH3m@}OzZgPFrpx*|43iLamPlJ9J^cm1+L7xNt9_aI+-v@mG z^hMB@Kz{%tJ~ACV0%xH~CNxKZ=GSqxn8siL1k)Xn>qmGm?+~o>ODK0G>K48pmeYh* z*CHz8y@puw|Tkq@C0>muEFvv@Brswy#}PWJ;{@i3FhZ` z`R?m6AAj;gBB`5y1N=;Q6XHvV8{jV5j|A|ip}YYkia(8)LIZ|G@>}4jfo~&#>v_x$ ziLQTcZSw7H{K+?qq;B~K=TEban~4AQ_(*f*igz#UvUmQe77Ori#alGDiKM;rr`gs` zB>#>0cnCyuuSnWEf12stgf1Ci9t--ZV&LgyXKx~{#QXWEuTbU=ESWD!B{mhXrD zkKoZQAW{9l6=h%UHAng%#6OKFMub1{<;iQd@8u|-Qj9^VKmSE0eS$?;Zu#!NB>&C0 zc?3j#=ziWSPqOp5qCfxT1Cjp~;D6ZbSU>(5j698gi2STAl1Wtl+fkNcNlN|srw&AZ z!k+vUrGES;5mAfiAL;XLAW`{Wg|Z^}Pr{>!@F%`Jq3O_cCrWtXM_>6}>kGGkvL_l- z>O&}V6u1jSYXD09@ShUXzgGMq%KsklRfsuKfBs_jNA~nekSPBNlwJ0wuwed6kUo#` z54FF6<)`>F-2RBanE!tn+I#f?@>AJx`y>8@|NhHA4*uc(m-w#;*&o?6>9pwhD<~Uo z|HOY)h(Gb=2~CHl*PukW{gdos{+Ij-=@j)Ll)1;7Vqi+a`cM4R2O|G#!S^iV9>!nH z|C2wt7bGhG>rs~W5-9bP-_`$c|I5c-l>agC4fDTbb^KpQ{Kxh|uS1)n{7<56HGw}! zLGru&561tA#J|Kp$@DMW|Byc+n-Y~@McFSsj6X>Ia{C_|AUSizC_!Iu2`p4$d z*P%U8`QL=HUl-t?!A_plyySn=aGlN_z&!VmcX0zl~R!Yx%|cAuLsah93(3LGRlUyLl; zYX3R3$NNAsg(aH%UP|^)^C`_KqT^3%UmZj#c>G<y(1W|KRMO)}H%m|MO^X5c}Um`InOYPeYy= z4+C666P;6j(<>D`{s(9O?*Q+D5PvcIe-LdQ2Z`$cyHNI8+CS|*h{-Qz|8r=MVro(O z-;J^qFHs5}|AVuCTL15-{nLCti2WB){-tF9i;#z6TG8>Rv%hN@|ChmEZ2l9oe_Bsa zOercqy;F59?f+5mzaK;?*#5}>irGKK^R%WDm7m@v3y;4J&iHx4}PL|HR|}htM9y^rG^U@4J@v-voa#|0ibuw4PcUKz`c$ zzLxR-W$^dgfA9S_KK}c^|GN$T3W!pW|0VA3`CB-DT2E0tC#rutD0?S}Qh)vjv;X}V z_|kbLr6B(9&NI!wq4KkLpU%AIB+CCiC@aE0iMPT<XX{fA5rE{pWwrMZSxHS?P!4dAkxlzQRv^if@gZX`zq1~ zFLMVx8S&+b-|u&QKOTO=E3jYkzZ30IANuj-&pw6^7irTB=~~GD4ES>SSts`Rhhr}) z{~p=xs{Uf;KLTmU9>;LK4s<=}1`xf|OFrrlh+-vr$MY5tz2`}1msE%7Hs5E;(|ch2 z8NX7j$3Nfe`^@k0>+8H{_u)$CsdWF1-j&XHC?CU>&O2!Tg7Qi4%G;nT?T6BSaL#)+ zjVtXD(HNI5Wb%pRnUjBo) zehBoemj_#!UV;ex0y75YP>gbCBERtCr$6^UKiw!F{iBcH6nWqcfA$=mvmXOdX?pI( zuvZlC-j}a)zXx~-*Fh8c9cq_fdHpeblaTxo@uM`3l_TX}&T^EWV)>GnpT|xpJ->47 zYxn)-=J!TMM;>JPJ6h#uS)Q-|Mn@J{p4QKl#t{ioeIB2U!zwAy%a6lCDIWz<8iy59 zp7wMpjgDMod4=WZ`4cS9-*+7yx$71x-{yBW%X5CCBlog=yT7ky`8NM?mT&VPeHpb! z*ijmfjIlhOu}~V1T*vYsb+eKA$PFxi<G^;DoE3t+|IPAT=f@*QAEoCsM^YM(+{W@hbF(y$ zUZLll-*{w%<$vNmAB&83Jojz~Ue*x7tz$+!jTwO==|NKgb?BQf(-8$xcoW|-rEh>W za5ZqJfL0aX_p|7`Ve~CQ+QHeMw7Fh5GrhFBF~4y3%_q)ooI5kIv_W&MXgZ5`l&SA~ zNl!j7{>8uf!qK}vQ;+;`xL$f zn!tD0%%N8>i>O=NL+9K58?KQe<3Epa-DclHbVpuXrfaP~BO}i~HT%r{rR}57{L5oc zeX2S7=;3dC^0Cole|+y}N520T-{o$jFE}si@!zBUTx6^RsjadazI(<`^ zyxD`u^|~+6+`vT^;3+Hc0Mvs`c!3JMW100pW1hmq^C0V@e)g_Ea5sjoaINik26(wM zP!LMAT06Kj!)CWq;20e&_k)v5wB;i3zl1$r+FRr!L7(?JFNpG=e|GtYAO5coo%r%! zP2BT|kH3eO_XDNVYlVfM{_R&rB7Yc}dX5&x115K$pZ?NL!r&(AK?D`Uu1s_oMlbL>*gs?I2)^_ki@cJp(?-31-yFHM zW1TcQ^3#rW5q@o{BagR=S)RYAfpr3F|Cra_cx062+v~zHmcQS7&eu7#_eKf*%e(6u zH$NVEp5@#9o1gYScT0)=?G-<*Kr8519ca`m-VD72jiz7vpgEHChE^->_=EJwjih;u zU1s*6Go9KBh_id?yI8N?Qf>Z&)M}Rvv#NH<-k;d|XXOzrdT0f7a^uwU+UXMmu7P^D zJlI{Xzqqo*^>@U}539erSD))I*T*3J<@(L_m!DH#ZT($n=`%{tb$o25o29@@-6pYau+c8?If^ zTCV-Va$&>s%PZ#!n@cBG78g!GG_k(0c4}hd^!nP;*^SMO)xyDAHMd_r-*Efo_B`zT zBjYC-KZ)AHUdnO(u1?Bij2ag4)2fe7mE5zX7(VS_H$m(yYfygOJ$l<$&Rj?R9ya;q zAEu3&<2BoiX=dFq>P}&&VH}^ESv9R%!KgK=s$wm@h(LSZ!h|Pf0*QT ze-o8T(GISErY?9hHI))Q322`KP>Vz|NdzPU5&?;TL_i`S5s(N-1SA3y0f~S_Kq4R! zkO)WwBmxoviGV~vA|Mfv2uK7Z0uljYAxn>RYqXrc;XL zqQ~cEG-p~f>yA-(3OfyBQ!C>)f)(66KO2oIHMMp16*{ffjM?b2sWoe;#!C5~t}V|- z6WfU-{v}qX7SpS%E2|5O>128;lU`g+XC@bN$;sqOI-Sg~cGrEl)!TIUqh-U^tV+XS z!Yo;a>KKZu)Q!uE?sa6MyUM0!HqcRa+rxo6mq`}YQc~AbT2e1&v&DQdku~&OVk)KP zr%F?$r~=^X_O#~AM$3-Vn2yJ728%sWt7w*KnZ+s0 zeMs++8eec6t5R$_h8>+da(HIj)~DTGDUNDw84eAK-B2|MJb0(u zyck%PX+n0#Y8otCsu)$>Cg@6PVyb8qOR1!uR8y%`u9P;48AF?zoXjM0+GJFz+bLIN z>UP@w;DU+%wmNqC^5u!k=?T-?BJ7g!N7hfdddC6Vw#CafT15vKB)`0F|IqK*Xj;#u z^TlL7meaD?SSGEdV#P#08Ox?g#L{F|)$+WLkPk+c$`p;s%w#N;Oijiz`C=)S&!jW4 zY$}z^8RI7q@ItPCle!ZYxFzQ3q-dH+2RJ{{M-LO=eTMCsLIlaLZZ{-qf>!fOz zmrT893vqJUo(-_iT_(crd4@rD!t#Fm6R*T(D{gB+vxJcIQM?YVnpSPuD5=dVCWebm zwOYZXpmSe%V9s5ywP%8G_rBX5#U1!ESb^)~8=BZ$QK^^AhU%1QcAAQxQ7s4Bv4kMW zGGS62*WK<|c-{@{Xv@b9*LeSqw;Jc(mu8~bXn^Zgrz>^c*q)8%FsWCnRkc_};Z{tb zSL{Yr-El*~z$VHST{jSXwyISSXb`K$=`mTh5K3ab zF$hFG+Zq6-p7mciPz8nrZNQR)0oJF*9t>BR7FytTx+EbUyp>g280~gtYc)i}1$B$UFLk=ZQ?f@&>scGml3KNWtF1K* z)#F(&D2q{47Nf8%MsZopADBJCc<>IbV}7pq2HCb7)KV!sRYWY8iRJYYmMK%EVl0== zWnv{Yo5rdpol0ck&(K#m=h~*>P@P0Rm&|B-I+mTx8L^U)PsHd>ET>PVlBH6vRLo^q z9ks5NO^dN9s@Y7bn2u>_BZ(@xWK5l!nv7{`rZ|~TW^!5vAp_W8VcRh&*bf;ymraX$ z%3$FBzAHi^Pwfwg{GOXF1{&IQ0TOhFySTw41UIrLo9^sSTkV)T;BbphuX{9Ozld(Y-bP3u*jU+iELC$1 zU19woed;vfUhVHuVir(vn4nFkjQW;SRxYX4CPI!>HtX|hzL2D?TziN>cCAg?Xbl*I z^k}JvQUwKQJ!LK{bN?e~&j`|aLn zY8g&46+T(H)fL*PV)J^ZAMVw7fIW20wKTVSw`kdojTP>D5kMMsI?(W~_Gpic1wO5% zc~FqxE_kW@8kYFzvVWV$A0;&FMYDyIS z%oj8by+Aa^8=$o<>pg{09ewXwX!UQ97SO!O{(TxMGU{EaHFB*l5i6x9i?NKB#u_F$m5HT|R63*6c~Uy-u3p%f4e4Ul)GoAks$Yx!c5nB0*Vgawy&UR96U%ZBroiLRSfy?|SWZJpTo!tJ z!@AM5stie8i+kSCjwdIQ@pf5SytlW@{T0}gR!I}v&#lh%-c{$e7&nXUDKKzP+Rv)1 zZ*KFaOpP6`w7BZcVzq*I5iEnrv&C~5BpxBQ)}A7+744s$Nq^I zFDfj?36g%=sW?^IZR?1kI>e@iy#o+;o2!FMfz=Oe-fpt<-@Xh2E9>nST?T8w1@?lw zmDYa`Y&WWb-KnbettMReFnxj58(XHe)3>fbUtGz&S9tcPG0V7Ap>@Q7t+mXuUN;@) z)A${JrpIq80jW@~*zU%W0{yBGM7H9T4W-_!6%9)w69d;A%08InVMqAxoa3s+o?g-Ff10}w zv635P?9ZvV7qJMvx#eNQrzCf9&frBCn4|a1H$`$Jef#s$hg+wbS) zyIAwuzLoZ{b=^gE|Ju->jspzJcS%aOJYc%|>cZ3^ryFTdqwcOkKT+N4K0$}B>`FR# z^~L3-1=}{Y3S6GPg4n^?8G5+Ume)e?B<|b9AvflSwR<4&w6(T?qr^*TI7)IEoaV3V zxd%t>a^CRtt0!D#bH{d!+L{;s6HDQ$98ydQLG^^xtSyJDDc#byrs+;)YgO9?M|U%P z<=d^UVJ-B)X2`yDLnYWur=MzTo+oja0h4(BU6qS$havZO8*Sa~(oL=~XN{6!(QBGr zvtmDg(5dBXI$k~)wii0KiifU?{kfs*Qe^HA;Rlfy?+5g++wCKStf*60dW`hoZqsd~ z$50R56pi)pl`+`ESH@@$Um3$ad}WOH;FSyPZGvG9oob68G=$AjL+D?t1ACEx%_ EmailMessage: ret = EmailMessage() # Merge duplicate keys (e.g. multiple TO entries) into one comma-separated value. + # Keyed by lowercased name to handle mixed casing (e.g. 'TO' vs 'To'). seen = {} for key, value in self.header.items(): if key.lower() == 'content-type': continue cleaned = value.replace('\r\n', '').replace('\n', '') - if key in seen: - seen[key] += ', ' + cleaned + lower = key.lower() + if lower in seen: + seen[lower] = (seen[lower][0], seen[lower][1] + ', ' + cleaned) else: - seen[key] = cleaned - for key, value in seen.items(): + seen[lower] = (key, cleaned) + for _, (key, value) in seen.items(): ret[key] = value ret['Content-Type'] = 'multipart/mixed' diff --git a/extract_msg_tests/message_tests.py b/extract_msg_tests/message_tests.py index 9d071269..57ca5537 100644 --- a/extract_msg_tests/message_tests.py +++ b/extract_msg_tests/message_tests.py @@ -37,6 +37,37 @@ def testMultiTo(self): cc_emails = [r.email.strip('\x00') for r in cc_recipients] self.assertIn('dave@example.com', cc_emails) + def testMultiToTo(self): + """ + Tests parsing a message where To headers use mixed casing (e.g. 'To' vs 'TO'). + """ + with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg: + self.assertIsInstance(msg, Message) + self.assertTrue(msg.subject.startswith('Test: multiple To recipients')) + + to_recipients = [r for r in msg.recipients if r.type == 1] + cc_recipients = [r for r in msg.recipients if r.type == 2] + + self.assertGreaterEqual(len(to_recipients), 2) + self.assertGreaterEqual(len(cc_recipients), 1) + + to_emails = [r.email.strip('\x00') for r in to_recipients] + self.assertIn('alice@example.com', to_emails) + self.assertIn('carol@example.com', to_emails) + + def testMultiToToAsEmailMessage(self): + """ + Tests EML conversion when To headers appear with mixed casing across recipients. + """ + with openMsg(TEST_FILE_DIR / 'multi-to-to.msg') as msg: + em = msg.asEmailMessage() + + self.assertIsInstance(em, EmailMessage) + self.assertEqual(sum(1 for k in em.keys() if k.lower() == 'to'), 1) + to_header = em['TO'] or em['To'] + self.assertIn('alice@example.com', to_header) + self.assertIn('carol@example.com', to_header) + def testMultiToAsEmailMessage(self): """ Tests that a message with multiple To recipients converts to EML without error, From 48c437517e23a9f1b68d4630461b532799ab53fc Mon Sep 17 00:00:00 2001 From: Kevin Tam Date: Sun, 19 Apr 2026 07:30:56 +0800 Subject: [PATCH 3/5] Fix asEmailMessage() raising UnicodeEncodeError with RFC 2047 encoded headers Improper header unfolding left tab characters mid-value after stripping newlines. RFC 2047 encoded words using invalid charsets (e.g. malformed GB2312) were passed raw to EmailMessage, whose folding code then failed to re-encode the decoded replacement characters via as_bytes(). Fix uses proper RFC 5322 unfolding and re-encodes problematic encoded words as UTF-8 before assigning to EmailMessage. --- .gitignore | 1 + example-msg-files/unicode-header.msg | Bin 0 -> 86016 bytes extract_msg/msg_classes/message_base.py | 44 ++++++++++++++++++++++-- extract_msg_tests/message_tests.py | 13 +++++++ 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 example-msg-files/unicode-header.msg diff --git a/.gitignore b/.gitignore index a027c2e4..97b36f23 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ __pycache__/ !/example-msg-files/unicode.msg !/example-msg-files/multi-to.msg !/example-msg-files/multi-to-to.msg +!/example-msg-files/unicode-header.msg # Reserved Folders /output diff --git a/example-msg-files/unicode-header.msg b/example-msg-files/unicode-header.msg new file mode 100644 index 0000000000000000000000000000000000000000..5c2ab54b456f8e8a76be7d8fa30f35e2150f2aef GIT binary patch literal 86016 zcmeHQdypeXdDrgxb!Qu&!5BYcb`W3=8?D~E(t3Aq-*<~|Z;xA@F|Qn>nbAr$pvZsZpHwB4suWd#^ZRyGOcjee?DoeeO^1`Hx6fnv0A@e*XNC$PK>x zqxek3(y>V79(DppSs4 zy}yF%uY!IJ^fAz{gFX)W4bX3bJ^}hI&~Jl23Hlw-r$E08`ZVY>pwEJS5A-?E?}I)M z`U2>Spg#bS9GMOtfpgF#6Plwy^Xs@;Ok*$rg6Yo4^&`BMcM8_|C6v1wbqik)%V|QZ zYY~;nUPCKNXpzP4IM9ES>3b0CAo3q_hspF=6>C`mUla4wFq* zD3@C0>8uEFvv@BkNJy#{)3dp%D^CYYb& z<-4!Pee%fwZa}(dI}*U3hVllGDE>5F3Jn+%_1}U-4RRX+QZHb3 zNOb*kYm;wp<4?X>Bz5B0u6XytE_;`sYKegSR=h=Xn@HNbe41_D zME$=JACG`&?iERUmrpajoBaK!*gzz8%eUcyiqLrnv#x7z7nrss5gm};C0Rt!oAU$E z{}DX81thBfx1sDyJ$9u3LGsgxVnpPVT%J6(eJ{uGlwu4@{pByR-X~au<(BXMOZ~qY zH*W<|8@iwO%2VI@T+v_t@`3dKWsrZ=Ygj+|8H_xQen|frFY1$M|L;IqiX|!am!CS2 z{uB1(uPF7CKZ%H1ME^*iZwHC?{}m`Jg8w8uiimuY%M)0Krn^wW3qShy-?hGQ`zL#% zF{L(y3P*vvL9_;-)DQnDG5u@BAENT_gj;GL({8JBHaF|?_&O!{0Zq4wINivC!1nmO2PV1^3w;>|5roqIVL?! zzL@_fe{vs4wEwR~S=vjW)UW@p{)hWtKK7#Wk3(*l|0S#A|3czFwhww8c#6tDg|gKI z{vZYQ-<5wb{!b+SCHYCFf8qXz{0Z5VX#Z7|{o*6|gVeA8=>wYoAAsD;OnR7n!ar31 z*j)M=@DuI-8&LLZ0`fE1$rO-3@cc)rM>NQOMB<6nC2E)gn{MqY3)Q0|&-KVsArI>+IQ2$-|ZVN*E4;z2- zX`=FJo&Hl2jua$6=V_jR{xP|O;Gc)wA2aE${Jri!kvTI6r_Kye6jfJLGX!#MEk#tvSIP3 zE1#pViWfe7M|BD@B_nb^rKK14v5{?usKkw;Zxc?ip|0f~W zAOG&v{&V2R+dzE^OEmXgOZHFmDa|RO<4U++x#9XplEv(w=JP3# zX#Y1*_Au@LEaZ#%KQa5C1V1{F5bgh)P?px~l!C|q;Ow8)p8IM4^WZm#{cocDwPgR( z=+BIY0WP75&MCj)l?opJgR}p)L3Tk%zL@<#1YRdVqWb?1ls!!Qr@aTU{)^fF9QaX8 zE!zKgqAbNrl!C|q;Ow8)|NCkGG@lP*|3#F)mh68K{h^pvbo}Y;?=a*4GUSWRe`5Ae z>j{b}Mf*?hR2`=MKL+^^fG7pqANgM~`=@xG)^wu%r+3N1Mn`_)$zR+JEwWhiU&!$QSc}V)jq#skH(6 zPkY~o8UJ5_e82to-hboczyJHc+mNq-C*@h0lY$X6qOhx|WaHvf0zzZ&^A=!eLEGxGh&SMcsS z?Ob=Jl=uGQqqt4$s$Pk|cgnB+^S|dJ-@)8bK&0_){5ykooR9o7PKSPgx7ObT+0VhV zu0j5($hU&zGu)~FZ|Nl*_qbes^~+yFBx>O8*J0C9Z~yhJU_fmU;qz`3`ZurO*&gq{ zitylN?t~{Jxjgav{jTrB!>@Y<_N)JI2R~{Hh1zE1mIBK87ouchLR?<&)l(^Pns3hths< z&U-eEEA0`{9vMBS{Sw+Qn)mXH-qp{8e_q2gimxfgq)D&ipdS{{#Ms>UXQMIJTpNJuZ-2FtiPWB^LtR|D(Jml-KTJUKj>*M z{{dV-2zu7bL##~KAOb(fjDa~6quiOuFZ}4~&;HL(Hp<8T=wmlU9(>)OJxAy4$3aw@ zo_jIuRmHparR&`90Up71&_sTR`0^{SKZb7-l0PDOl*X}gr2I=+j`CA1U-I(v*a@ZQ zSC4=7{=eM(?#SrKLo9!1tNbj>^Y!28$O6mL`kB%=A|a~JZktvVZvuDawE%=AEz`HxrybC50mF7 zzke^wxADJ^<#~T^j*PQBD~vy5k^jBT-=49^|FJxOK8{y!=sAD?ZFJ-a%k$@>BQIll z{(jhaB+K%by#nKrJj;Krm0xA~Q(k^Pa$|z{C*J@j*yb0nql$n7lu zQ#VWV=v8{o<&8&1SpLV}^RdWi$8+y?;AIUF+&X5&Gnf%5k{&b_P=~(xG96J6g*WjX zQ~Cy216Kog3gD{vzMn6MbkOF zqfBkzOM35v<6roj&mX({)Ah&?Ms6#>lt!*D;q3?7Q`!W&1@=u}jeRe19^cio@QvIp zwr*$wsR?{{%^Z3KvxvIIJv6@EzTub@nf!T->o)rqqC4{9eL8IY85w!@so7^9C~Y5m z=3gFv>XXgU$Buse6OWG`|Ks~UGxEK^_zrg)eZl#K9{)Ys&tdCN-+>W$NZ;J0Z_Fx) z(&?MRl=?B*@QO-r+e;Iqcw71AdfN6z5JfbL-#yoa7za7J-tUj zk%NQ%7>XQJw7l+lzcTBQjpq*ZDT>(AK?D`Uu1s_oK`-(;*gs?I2)^_ki@cnz(?-2^ z-yFHEW1TcQ@{^8r5q@o{BagR=S)RYAfpr4of866Y9vNl%_PTJ4T~_&`WU3YT)(;g@^k7JT7MUyi{!z`A1`=oJqvli z#;UC{6U^6{&~_>_k3FMefWCTgv@Uy%|MsfC;WJdLsXae1{asi(wQzc30n7Ju3-im1 zhoHXX-Awt zs+g%SLQCnFmnhztVDScdXN~DQ#UCa3>GobgTh|VPyNlDHe}m$tLHP_@zKzLn9Ks{J z;rNQya_twE3mcwaUO8XbTspO~xNzp-iS>oG(-RwK*4LKKZES9=77o^`x&89_hTAW< z=V9j`89&MRNz@khQjY5{=fU-t>+`VsE8{1QXz#0xpL*jWdiN&9UG;s%t+4vcnCo$Q zK8(*DPrL7nP`M*G)%q^WbFUp%+dV_3RUe%yxo1l;eA>Zog4kKspz^v~^p-E5y^h*F zZ1VSi7#=exYPK2E%(`RLox)DTI59V~YFf2|QEOCH$FP;{TD5M^M$3-Vn2yJ7t!&g( zd!km+EYmhi&V**x;&xfJjD}gMJMqiuxW~-_w|EJ>?C9Lga>WL}ow;RGYt|s@bfsvi z)((l9iSvguD|*H0U4%WEIb&Su{g@ui#5)=;wsSgj0{v4i8$B11L?WsfbJe5dH#3=@6lFOu*v$2KLQZkmw=aaEqVl@$4 z$}MJci!1qs)s=-8NB?&tBJaGA2+#>XouBufM+e!%c2BYuoR#x^V5;q&Bc!kb?R~;L zhc^n3V&<5|8+ltOox*PpCGqzJ-orV8-@j>KXZ&%@2ReRDdkJshoy5~Jc%5W#JiHZq zL2>;FteYeHLH*R+D+Yj>F8lp(bauNZFfJ8tdAQ6xVNCYGT z5&?;TL_i`S5s(N-1SA3y0f~S_Kq4R!kO)WwBmxoviGV~vA|Mfv2uK7Z0ulj&dn4}eP?c_F;_5bN5OwERjDd;Yhote z!2j#yK8>@Y2h$A6RU#k}kO)WwBmxoviGV~vA|Mfv2uK7Z0ulj}Dic!~0y;9$rjW(T9EEhd7 zH={Yznpt;@x>ML`7@JxdzY(n9=7rg4RH>=0tFO=*wPwslmrbo%Lp4^)_jGM}Hk#N@ zB=Ik?GPRgqU0qpSSWG9=QoIF0Fe+-9)Y6SazFnYLMSCN#4aH%q08X2es8#ALi?II6BXYTSne zXH-*%VbrkDddsO8Heck9uaZCjsqTctRvwPiRoD0V~D(BZ)w-Im3` zvP=_wcdVwtvZabq)op^Vq$Z|{MzNGi>Pa<~O65vvqnI(YsmaMqBBxD8mAajBRiy@vZts)gXIi4VaDkAm9R^(~YXJO@4si zvW-hkIE_}mlLxoFb+xW;G3Kq#_TZ&ftDG%oV;QB{D5=c4p;y#R!@3NETK7x|x>u=h zXj;>PIwYdI`q8uUs%krXVz6nbmR3I7GQOc&u~B@?(41w%!C%wzyqYb@Dm}efRxLw+ zbHyp2x8V_os0Y#brFYSqsp+a#f`TWrS&7iTqSDohg;lS~{JAPZS!9^mIYp zUQ!*cyilzQ*6AGlW%%`ideNv4ReNLU+)(vS8g;``X>KW0YUK0=SG<+Wu&q<7U0yQv zo*d%jvOODMpSw(i-SZ5C?1bh0_9tG6%~ssjf@TRJ=c9NHS~acOvQbi-RZI+*nrgLz zNkQkn@W7nAUTe<;;qHC6JBmB-Wv~L*$2T;wxuQ}pnGMw`)9f@AKdV{}v||ZDl4ZiA zIIg?hvGBYb*wB`b8;*JZhPRk=?@Ke$Y&5|2s?(LaZfws+bC}dC)v8*oqHrrF&?|PM zs_wXRJEA165#1&;~3y7+`%k_J9qxY#;;|u*5!m_Xuaf*mm|{mjXdb8_aA^FdcQ< zteZ9NSZIOU>5_zGa4xH~Fxu_P)@q1`3+fhyU+Q#+r(}0rI$NXIJ4YF-Fh*K#$RYWY8iRJYYmMK%EVl0==Wnv{Y zo5rdpol0ck&(Kyl=h~*>P@P0Rm&|B-I+mTx8L^U)PsHd>ET>PVlBH6vRLo^q9ks5N zO^b;ss@Y7bn2u>_BZ(@xWK5l!nv7{`rZ|~TW^!5vAp^uSTw41T)!_O?S4Zt#-^EaJWUM*FBoCzl(0b-bP3u*jU+iELC$1U19AY zed;vfUi|miV-`?wn4nFkjrx{TRxYd6CPI!>HtX|hzL2D?TziN>cCAg?Xbl*I^x)J( zse%Hu9<}#~>OSgG_v)*KQEI>_WH+M$AlhB|g+$18V?t{`p^YQC_PgfSe!DlCT85KM zg-=#)b%i#n*u389hkG#(u!pX>mgZLP7A?E6vBG^X0!X7y2O7TB9_^8_z^An|4+;|8 z1uvCf!xA4&_HXm}ql9L?Xg0C@)mtokw6MRF<@N$Os?avXY?Qi1%hmc;m*{|%`GTgQ z6^O<-16td%)>9bO(e|!|R)2=HfaWaww`r)zsCT8-$h9WL-6PCub|KB~`L@M&$6akb zYwvot)i^zn8Y`VF{BvP%F#!rK8cMy;fko%`Y#Svyxe3|iK~=VNQrP~kf(>3}$*eYO zv{S22@0um66&qWHPZ;Sm)-BmYtdyQC#xhzOYnbFzCYCl*>5NY2N$IS+hUs)3-3_(N z<`uTwnvL@PWyG{9lYPY~4tNyPrDKEn8|G`4$|jB~PMhY%W~1dK`g?>&C|cxpN2cu6 z?(KL44E$gX&E+l>PIPU2(((>Ft|jaltB%di`EW3K5sTtoXQh@A^BqKmMI0(W1MS$3 zZq|Jp(#5K&U2N@CzZ(1P-tO58WIn;?JmgOEyfybY*O5Jv_oQ9IPBJ}o#b)#uj z8Irmd_q?GUPfjG`?Xt9ZZ*Q0TE3hZ6k|wsFTaD?xtIlmPZWh~9VBns#UsPAyT=u6; zjUBGEr0UILwSspMEQ9sQYeXEqw&R!3Lu|aKk&5fu1kAI2`X|&1-r@Za&vw;Ky5T=BbSL|z!iBB}tx4JEeY~xM-GzC;;~g0JXFaynp=;9`<{q3bc83BTn;k8g zyvbf=)+v_*9piKc6G9xOE7;k0=oo0g7PvuJ$7>RO#PfSsnsD92v;|ggY?;AaeL;y!m~fjEaP&8))518Ynf%eZaU1T@jLuX zkKa@RQlVV2-Hjs!`c)x_Y{e-XO1)Vt8kS;~6kENFd#0uEM%oif0VUK%qe>NM`2!}{ zXlj^P4B=o-!Cb2}8#LM_446+_j^EoKw99_`gMcNes~7w%oPqqpG^d6izm zz>8*dM!dM^(eH3sHIHrIaoeTL9X@9h1J@kNK3LDgj_}<%$5o3xy`t6sG0O_wQJ4cldR--_Of;iRQI^ zEA3(Hx{K=mwV^*92N;y^>M7mwfaz+h3sXlw-AIF&y1NSfy6RTiZ5IZe>wJGNug*1YhaL<(2skYZ8@swbpo?S8nL(%t*kG~KCet!lgA=w`!LzTN5? z#-Rr`L$;+GDnT}#eyXi`UXQyBn7Y^BRk_G^7;=BN(bnBA-Q)^$&L|lcy{6eUEB5mT zom#%8;yM2U^6?N)LkC7hSZMsZ)4E4~g zXsm~?jKLnhGDds&${6n9D`UI|uUuen6AWwUR9pO@A#9EsLjPI~*h>%CtP)}?Jv str: + """ + Prepare a header value from a compat32-parsed Message for safe assignment + to an EmailMessage. + + RFC 2047 encoded words that use unknown or invalid charsets are re-encoded + as UTF-8 so that EmailMessage.as_bytes() does not raise UnicodeEncodeError + when folding the header. Raw non-ASCII characters are also RFC 2047-encoded. + """ + if '=?' in value: + def _fix_word(m: re.Match) -> str: + word = m.group(0) + try: + parts = _decode_header(word) + if len(parts) != 1 or not isinstance(parts[0][0], bytes): + return word + btext, cs = parts[0] + try: + text = btext.decode(cs or 'ascii') + except (UnicodeDecodeError, LookupError): + text = btext.decode('latin-1', 'replace') + return str(_Header(text, charset='utf-8')) + except Exception: + return word + + value = _RFC2047_WORD.sub(_fix_word, value) + + try: + value.encode('ascii') + except (UnicodeEncodeError, UnicodeDecodeError): + value = str(_Header(value, charset='utf-8')) + + return value + class MessageBase(MSGFile): """ @@ -171,14 +209,16 @@ def asEmailMessage(self) -> EmailMessage: for key, value in self.header.items(): if key.lower() == 'content-type': continue - cleaned = value.replace('\r\n', '').replace('\n', '') + # RFC 5322 unfolding: replace CRLF/LF + whitespace with a single space. + cleaned = re.sub(r'\r?\n[ \t]', ' ', value) + cleaned = cleaned.replace('\r\n', '').replace('\n', '') lower = key.lower() if lower in seen: seen[lower] = (seen[lower][0], seen[lower][1] + ', ' + cleaned) else: seen[lower] = (key, cleaned) for _, (key, value) in seen.items(): - ret[key] = value + ret[key] = _sanitize_header(value) ret['Content-Type'] = 'multipart/mixed' diff --git a/extract_msg_tests/message_tests.py b/extract_msg_tests/message_tests.py index 57ca5537..5e2cc42a 100644 --- a/extract_msg_tests/message_tests.py +++ b/extract_msg_tests/message_tests.py @@ -68,6 +68,19 @@ def testMultiToToAsEmailMessage(self): self.assertIn('alice@example.com', to_header) self.assertIn('carol@example.com', to_header) + def testUnicodeHeaderAsEmailMessage(self): + """ + Tests that a message whose To header mixes plain and RFC 2047-encoded + display names (using a charset with invalid byte sequences) can be + converted to EML bytes without a UnicodeEncodeError. This crashes on + Python 3.13 without the fix. + """ + with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg: + em = msg.asEmailMessage() + self.assertIsInstance(em, EmailMessage) + raw = em.as_bytes() + self.assertIn(b'alice@example.com', raw) + def testMultiToAsEmailMessage(self): """ Tests that a message with multiple To recipients converts to EML without error, From 74bfd5ba5ba85bd253897a859686655f5b8868b0 Mon Sep 17 00:00:00 2001 From: Kevin Tam Date: Sun, 19 Apr 2026 08:07:35 +0800 Subject: [PATCH 4/5] Fix GB2312-labelled RFC 2047 headers mangled when bytes are valid GBK Real-world emails often label headers as GB2312 but use byte sequences only valid in GBK (a strict superset). The previous latin-1 fallback produced garbled display names. Now tries GBK/CP936 before latin-1 when a GB2312-declared encoded word fails to decode. --- extract_msg/msg_classes/message_base.py | 18 +++++++++++++++++- extract_msg_tests/message_tests.py | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py index d97c3046..42780a62 100644 --- a/extract_msg/msg_classes/message_base.py +++ b/extract_msg/msg_classes/message_base.py @@ -59,6 +59,15 @@ _RFC2047_WORD = re.compile(r'=\?[^?]+\?[bBqQ]\?[^?]*\?=') +# Encodings to try when a declared charset fails, keyed by normalised charset name. +# GBK is a strict superset of GB2312 and accepts the ASCII-range second bytes that +# GB2312 rejects, so real-world GB2312-labelled headers often decode correctly as GBK. +_CHARSET_FALLBACKS: Dict[str, Tuple[str, ...]] = { + 'gb2312': ('gbk', 'cp936'), + 'gb_2312': ('gbk', 'cp936'), + 'gb_2312-80': ('gbk', 'cp936'), +} + def _sanitize_header(value: str) -> str: """ @@ -80,7 +89,14 @@ def _fix_word(m: re.Match) -> str: try: text = btext.decode(cs or 'ascii') except (UnicodeDecodeError, LookupError): - text = btext.decode('latin-1', 'replace') + for fallback in _CHARSET_FALLBACKS.get((cs or '').lower(), ()): + try: + text = btext.decode(fallback) + break + except (UnicodeDecodeError, LookupError): + continue + else: + text = btext.decode('latin-1', 'replace') return str(_Header(text, charset='utf-8')) except Exception: return word diff --git a/extract_msg_tests/message_tests.py b/extract_msg_tests/message_tests.py index 5e2cc42a..7b89244f 100644 --- a/extract_msg_tests/message_tests.py +++ b/extract_msg_tests/message_tests.py @@ -3,6 +3,7 @@ ] +import base64 import unittest from email.message import EmailMessage @@ -81,6 +82,23 @@ def testUnicodeHeaderAsEmailMessage(self): raw = em.as_bytes() self.assertIn(b'alice@example.com', raw) + def testGbkFallbackDisplayName(self): + """ + Tests that RFC 2047 encoded words declared as GB2312 but containing + byte sequences only valid in GBK (a strict superset) are decoded + correctly rather than mangled via latin-1 fallback. + + The encoded word =?gb2312?B?6pCzydXCKG1heGNoZW4p?= decodes to + '陳成章(maxchen)' in GBK. Without the fix, the display name is + garbled as latin-1. + """ + with openMsg(TEST_FILE_DIR / 'unicode-header.msg') as msg: + em = msg.asEmailMessage() + raw = em.as_bytes() + # The correctly GBK-decoded name must appear RFC 2047-encoded as UTF-8. + # The Chinese characters 陳成章 base64-encoded in UTF-8 is the marker. + self.assertIn(base64.b64encode('陳成章'.encode('utf-8')), raw) + def testMultiToAsEmailMessage(self): """ Tests that a message with multiple To recipients converts to EML without error, From 6adacd14105470f4a7e8f073832157dac6db77c1 Mon Sep 17 00:00:00 2001 From: Kevin Tam Date: Sun, 19 Apr 2026 08:39:58 +0800 Subject: [PATCH 5/5] fix: clean up eml header handling --- extract_msg/msg_classes/message_base.py | 122 ++++++++++++++++-------- 1 file changed, 83 insertions(+), 39 deletions(-) diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py index 42780a62..95d68a8a 100644 --- a/extract_msg/msg_classes/message_base.py +++ b/extract_msg/msg_classes/message_base.py @@ -69,39 +69,57 @@ } -def _sanitize_header(value: str) -> str: +def _fix_encoded_word(m: re.Match) -> str: """ - Prepare a header value from a compat32-parsed Message for safe assignment - to an EmailMessage. + Regex substitution callback: decode one RFC 2047 encoded word and + re-emit it as a valid UTF-8 encoded word. - RFC 2047 encoded words that use unknown or invalid charsets are re-encoded - as UTF-8 so that EmailMessage.as_bytes() does not raise UnicodeEncodeError - when folding the header. Raw non-ASCII characters are also RFC 2047-encoded. + If the declared charset fails (e.g. GB2312-labelled GBK bytes), charset + fallbacks from _CHARSET_FALLBACKS are tried before falling back to + latin-1 with replacement characters. """ - if '=?' in value: - def _fix_word(m: re.Match) -> str: - word = m.group(0) - try: - parts = _decode_header(word) - if len(parts) != 1 or not isinstance(parts[0][0], bytes): - return word - btext, cs = parts[0] + word = m.group(0) + try: + parts = _decode_header(word) + if len(parts) != 1 or not isinstance(parts[0][0], bytes): + return word + btext, cs = parts[0] + try: + text = btext.decode(cs or 'ascii') + except (UnicodeDecodeError, LookupError): + for fallback in _CHARSET_FALLBACKS.get((cs or '').lower(), ()): try: - text = btext.decode(cs or 'ascii') + text = btext.decode(fallback) + break except (UnicodeDecodeError, LookupError): - for fallback in _CHARSET_FALLBACKS.get((cs or '').lower(), ()): - try: - text = btext.decode(fallback) - break - except (UnicodeDecodeError, LookupError): - continue - else: - text = btext.decode('latin-1', 'replace') - return str(_Header(text, charset='utf-8')) - except Exception: - return word + continue + else: + text = btext.decode('latin-1', 'replace') + return str(_Header(text, charset='utf-8')) + except Exception: + return word - value = _RFC2047_WORD.sub(_fix_word, value) + +def _preprocess_encoded_words(text: str) -> str: + """ + Replace every RFC 2047 encoded word in *text* with a clean UTF-8 + encoded word. Safe to apply to an entire raw header block before + feeding it to the modern email policy parser. + """ + return _RFC2047_WORD.sub(_fix_encoded_word, text) + + +def _sanitize_header(value: str) -> str: + """ + Prepare a header value from a compat32-parsed Message for safe assignment + to an EmailMessage (fallback path when no raw headerText is available). + + RFC 2047 encoded words are re-encoded as UTF-8; raw non-ASCII characters + are also RFC 2047-encoded so that EmailMessage.as_bytes() never raises + UnicodeEncodeError. + """ + if '=?' in value: + value = _RFC2047_WORD.sub(_fix_encoded_word, value) try: value.encode('ascii') @@ -219,22 +237,48 @@ def asEmailMessage(self) -> EmailMessage: """ ret = EmailMessage() - # Merge duplicate keys (e.g. multiple TO entries) into one comma-separated value. - # Keyed by lowercased name to handle mixed casing (e.g. 'TO' vs 'To'). - seen = {} - for key, value in self.header.items(): + # Prefer the raw transport-header block: pre-fix any malformed RFC 2047 + # encoded words (e.g. GB2312-labelled GBK bytes), then parse with the + # modern policy so that RFC 5322 unfolding and RFC 2047 decoding are + # handled natively and the values arrive as clean Unicode strings. + # Fall back to the compat32-derived self.header when no raw text is + # stored (synthesised headers from MAPI properties). + _ADDRESS_HEADERS = frozenset({'to', 'cc', 'bcc', 'reply-to'}) + if self.headerText: + raw = self.headerText + if raw.startswith('Microsoft Mail Internet Headers Version 2.0'): + raw = raw[43:].lstrip() + raw = _preprocess_encoded_words(raw) + source_items = list(HeaderParser(policy = policy.default).parsestr(raw).items()) + sanitize = False + else: + source_items = list(self.header.items()) + sanitize = True + + # Address headers (To/CC/BCC/Reply-To) may appear once per recipient in + # the stored header block; merge them into a single comma-separated value. + # All other headers — including multi-valued trace headers like Received + # and Authentication-Results — are forwarded as-is; EmailMessage appends + # rather than replaces, so natural repetition is preserved correctly. + address_merged: Dict[str, Tuple[str, str]] = {} + for key, value in source_items: if key.lower() == 'content-type': continue - # RFC 5322 unfolding: replace CRLF/LF + whitespace with a single space. - cleaned = re.sub(r'\r?\n[ \t]', ' ', value) - cleaned = cleaned.replace('\r\n', '').replace('\n', '') + if sanitize: + # compat32 values are folded and may contain raw encoded words. + value = re.sub(r'\r?\n[ \t]', ' ', value) + value = value.replace('\r\n', '').replace('\n', '') + value = _sanitize_header(value) lower = key.lower() - if lower in seen: - seen[lower] = (seen[lower][0], seen[lower][1] + ', ' + cleaned) + if lower in _ADDRESS_HEADERS: + if lower in address_merged: + address_merged[lower] = (address_merged[lower][0], address_merged[lower][1] + ', ' + value) + else: + address_merged[lower] = (key, value) else: - seen[lower] = (key, cleaned) - for _, (key, value) in seen.items(): - ret[key] = _sanitize_header(value) + ret[key] = value + for _, (key, value) in address_merged.items(): + ret[key] = value ret['Content-Type'] = 'multipart/mixed'