From cef74d0d051ded2e95e06f81c42b2809bf39d826 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 23 Apr 2026 14:46:35 +0300 Subject: [PATCH 1/9] gh-148821: Always reject known multi-byte encodings in pyexpat The XML parser (pyexpat) now raises ValueError for known unsupported multi-byte encodings such as "ISO-2022-JP", "utf8" (without hyphen) or "raw-unicode-escape" instead of failing later, when encountering non-ASCII data. --- Include/codecs.h | 6 +++ Include/internal/pycore_codecs.h | 2 +- Lib/codecs.py | 5 +- Lib/encodings/big5.py | 1 + Lib/encodings/big5hkscs.py | 1 + Lib/encodings/cp932.py | 1 + Lib/encodings/cp949.py | 1 + Lib/encodings/cp950.py | 1 + Lib/encodings/euc_jis_2004.py | 1 + Lib/encodings/euc_jisx0213.py | 1 + Lib/encodings/euc_jp.py | 1 + Lib/encodings/euc_kr.py | 1 + Lib/encodings/gb18030.py | 1 + Lib/encodings/gb2312.py | 1 + Lib/encodings/gbk.py | 1 + Lib/encodings/hz.py | 1 + Lib/encodings/idna.py | 1 + Lib/encodings/iso2022_jp.py | 1 + Lib/encodings/iso2022_jp_1.py | 1 + Lib/encodings/iso2022_jp_2.py | 1 + Lib/encodings/iso2022_jp_2004.py | 1 + Lib/encodings/iso2022_jp_3.py | 1 + Lib/encodings/iso2022_jp_ext.py | 1 + Lib/encodings/iso2022_kr.py | 1 + Lib/encodings/johab.py | 1 + Lib/encodings/punycode.py | 1 + Lib/encodings/raw_unicode_escape.py | 1 + Lib/encodings/shift_jis.py | 1 + Lib/encodings/shift_jis_2004.py | 1 + Lib/encodings/shift_jisx0213.py | 1 + Lib/encodings/unicode_escape.py | 1 + Lib/encodings/utf_16.py | 1 + Lib/encodings/utf_16_be.py | 1 + Lib/encodings/utf_16_le.py | 1 + Lib/encodings/utf_32.py | 1 + Lib/encodings/utf_32_be.py | 1 + Lib/encodings/utf_32_le.py | 1 + Lib/encodings/utf_7.py | 1 + Lib/encodings/utf_8.py | 1 + Lib/encodings/utf_8_sig.py | 1 + Lib/test/test_codecs.py | 3 ++ Lib/test/test_pyexpat.py | 47 ++++++++++++++++++- ...-04-23-14-46-30.gh-issue-148821.cR4kMa.rst | 4 ++ Modules/pyexpat.c | 26 ++++++++++ Tools/unicode/gencjkcodecs.py | 1 + 45 files changed, 128 insertions(+), 3 deletions(-) create mode 
100644 Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst diff --git a/Include/codecs.h b/Include/codecs.h index 512a3c723eca18..d14f527dee75da 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -170,6 +170,12 @@ PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc); PyAPI_DATA(const char *) Py_hexdigits; #endif +#ifndef Py_LIMITED_API +PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding( + const char *encoding, + const char *alternate_command); +#endif + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_codecs.h b/Include/internal/pycore_codecs.h index 52dca1362592d6..bfa10eadf73573 100644 --- a/Include/internal/pycore_codecs.h +++ b/Include/internal/pycore_codecs.h @@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name); in Python 3.5+? */ -extern PyObject* _PyCodec_LookupTextEncoding( +PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding( const char *encoding, const char *alternate_command); diff --git a/Lib/codecs.py b/Lib/codecs.py index e4a8010aba90a5..e99460a670a516 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -93,7 +93,8 @@ class CodecInfo(tuple): def __new__(cls, encode, decode, streamreader=None, streamwriter=None, incrementalencoder=None, incrementaldecoder=None, name=None, - *, _is_text_encoding=None): + *, _is_text_encoding=None, + _is_single_byte=None): self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) self.name = name self.encode = encode @@ -104,6 +105,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None, self.streamreader = streamreader if _is_text_encoding is not None: self._is_text_encoding = _is_text_encoding + if _is_single_byte is not None: + self._is_single_byte = _is_single_byte return self def __repr__(self): diff --git a/Lib/encodings/big5.py b/Lib/encodings/big5.py index 7adeb0e1605274..8bed14b35c5899 100644 --- a/Lib/encodings/big5.py +++ b/Lib/encodings/big5.py @@ -36,4 +36,5 @@ def getregentry(): 
incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/big5hkscs.py b/Lib/encodings/big5hkscs.py index 350df37baaedaf..eeeb7865895190 100644 --- a/Lib/encodings/big5hkscs.py +++ b/Lib/encodings/big5hkscs.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/cp932.py b/Lib/encodings/cp932.py index e01f59b7190576..3671a4387f96b6 100644 --- a/Lib/encodings/cp932.py +++ b/Lib/encodings/cp932.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/cp949.py b/Lib/encodings/cp949.py index 627c87125e2aff..df998ba3bad75c 100644 --- a/Lib/encodings/cp949.py +++ b/Lib/encodings/cp949.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/cp950.py b/Lib/encodings/cp950.py index 39eec5ed0ddef9..12c7bbd8d226ad 100644 --- a/Lib/encodings/cp950.py +++ b/Lib/encodings/cp950.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_jis_2004.py b/Lib/encodings/euc_jis_2004.py index 72b87aea68862f..68604db3c30b2d 100644 --- a/Lib/encodings/euc_jis_2004.py +++ b/Lib/encodings/euc_jis_2004.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_jisx0213.py b/Lib/encodings/euc_jisx0213.py index cc47d04112a187..cd2808965a6edd 100644 --- a/Lib/encodings/euc_jisx0213.py +++ b/Lib/encodings/euc_jisx0213.py @@ -36,4 +36,5 @@ def getregentry(): 
incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_jp.py b/Lib/encodings/euc_jp.py index 7bcbe4147f2ad4..bcdd0582d71902 100644 --- a/Lib/encodings/euc_jp.py +++ b/Lib/encodings/euc_jp.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_kr.py b/Lib/encodings/euc_kr.py index c1fb1260e879f0..8a81356d8f9980 100644 --- a/Lib/encodings/euc_kr.py +++ b/Lib/encodings/euc_kr.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/gb18030.py b/Lib/encodings/gb18030.py index 34fb6c366a7614..98df7d4cbeec3d 100644 --- a/Lib/encodings/gb18030.py +++ b/Lib/encodings/gb18030.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/gb2312.py b/Lib/encodings/gb2312.py index 3c3b837d618ecd..ba915a2500f21a 100644 --- a/Lib/encodings/gb2312.py +++ b/Lib/encodings/gb2312.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/gbk.py b/Lib/encodings/gbk.py index 1b45db89859cdf..d597c7bb77e93e 100644 --- a/Lib/encodings/gbk.py +++ b/Lib/encodings/gbk.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/hz.py b/Lib/encodings/hz.py index 383442a3c9ac9a..43ee36a9286426 100644 --- a/Lib/encodings/hz.py +++ b/Lib/encodings/hz.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, 
streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index d31ee07ab45b76..98bf9462e36fbf 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -385,4 +385,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp.py b/Lib/encodings/iso2022_jp.py index ab0406069356e4..27129ce67aa884 100644 --- a/Lib/encodings/iso2022_jp.py +++ b/Lib/encodings/iso2022_jp.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_1.py b/Lib/encodings/iso2022_jp_1.py index 997044dc378749..0f41dd95cd4332 100644 --- a/Lib/encodings/iso2022_jp_1.py +++ b/Lib/encodings/iso2022_jp_1.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_2.py b/Lib/encodings/iso2022_jp_2.py index 9106bf762512fd..25f625819f5ea0 100644 --- a/Lib/encodings/iso2022_jp_2.py +++ b/Lib/encodings/iso2022_jp_2.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_2004.py b/Lib/encodings/iso2022_jp_2004.py index 40198bf098570b..1f0bd1b7874472 100644 --- a/Lib/encodings/iso2022_jp_2004.py +++ b/Lib/encodings/iso2022_jp_2004.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_3.py b/Lib/encodings/iso2022_jp_3.py index 346e08beccbbaf..2acdb3a2cd9be3 100644 --- a/Lib/encodings/iso2022_jp_3.py +++ b/Lib/encodings/iso2022_jp_3.py @@ -36,4 +36,5 @@ def getregentry(): 
incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_ext.py b/Lib/encodings/iso2022_jp_ext.py index 752bab9813a094..a32a533e8bdf00 100644 --- a/Lib/encodings/iso2022_jp_ext.py +++ b/Lib/encodings/iso2022_jp_ext.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_kr.py b/Lib/encodings/iso2022_kr.py index bf7018763eae38..51dd4ab560422a 100644 --- a/Lib/encodings/iso2022_kr.py +++ b/Lib/encodings/iso2022_kr.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/johab.py b/Lib/encodings/johab.py index 512aeeb732b522..e58c50a06c4b96 100644 --- a/Lib/encodings/johab.py +++ b/Lib/encodings/johab.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index 268fccbd53974e..335acb87cb9b28 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -250,4 +250,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py index 46c8e070dd192e..5c5b41437a84b4 100644 --- a/Lib/encodings/raw_unicode_escape.py +++ b/Lib/encodings/raw_unicode_escape.py @@ -43,4 +43,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/shift_jis.py b/Lib/encodings/shift_jis.py index 83381172764dea..bf7fded09468c8 100644 --- a/Lib/encodings/shift_jis.py +++ 
b/Lib/encodings/shift_jis.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/shift_jis_2004.py b/Lib/encodings/shift_jis_2004.py index 161b1e86f9918a..ae40b684a010f2 100644 --- a/Lib/encodings/shift_jis_2004.py +++ b/Lib/encodings/shift_jis_2004.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/shift_jisx0213.py b/Lib/encodings/shift_jisx0213.py index cb653f53055e67..5af8565618b40e 100644 --- a/Lib/encodings/shift_jisx0213.py +++ b/Lib/encodings/shift_jisx0213.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py index 9b1ce99b339ae0..d896cefc9596be 100644 --- a/Lib/encodings/unicode_escape.py +++ b/Lib/encodings/unicode_escape.py @@ -43,4 +43,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py index d3b9980026666f..eac93bd17d07d1 100644 --- a/Lib/encodings/utf_16.py +++ b/Lib/encodings/utf_16.py @@ -152,4 +152,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py index 86b458eb9bcd96..d056cf9202a40f 100644 --- a/Lib/encodings/utf_16_be.py +++ b/Lib/encodings/utf_16_be.py @@ -39,4 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py index 
ec454142eedf25..2e07f76cc3f742 100644 --- a/Lib/encodings/utf_16_le.py +++ b/Lib/encodings/utf_16_le.py @@ -39,4 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py index 1924bedbb74c68..aebe145ec95e71 100644 --- a/Lib/encodings/utf_32.py +++ b/Lib/encodings/utf_32.py @@ -147,4 +147,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py index fe272b5fafec69..ee1b41a11aa35f 100644 --- a/Lib/encodings/utf_32_be.py +++ b/Lib/encodings/utf_32_be.py @@ -34,4 +34,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py index 9e48210928ee65..4ac786bb73349b 100644 --- a/Lib/encodings/utf_32_le.py +++ b/Lib/encodings/utf_32_le.py @@ -34,4 +34,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_7.py b/Lib/encodings/utf_7.py index 8e0567f2087d65..3127867fb5bff9 100644 --- a/Lib/encodings/utf_7.py +++ b/Lib/encodings/utf_7.py @@ -35,4 +35,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py index 1bf6336571547b..3801615ce34001 100644 --- a/Lib/encodings/utf_8.py +++ b/Lib/encodings/utf_8.py @@ -39,4 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py index 
1bb479203f365d..b5e5c89f80b9eb 100644 --- a/Lib/encodings/utf_8_sig.py +++ b/Lib/encodings/utf_8_sig.py @@ -127,4 +127,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 79c8a7ef886482..03dd61a76db154 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1892,6 +1892,7 @@ def test_copy(self): self.assertIsNot(dup, orig) self.assertEqual(dup, orig) self.assertTrue(orig._is_text_encoding) + self.assertFalse(orig._is_single_byte) self.assertEqual(dup.encode, orig.encode) self.assertEqual(dup.name, orig.name) self.assertEqual(dup.incrementalencoder, orig.incrementalencoder) @@ -1912,6 +1913,7 @@ def test_deepcopy(self): self.assertIsNot(dup, orig) self.assertEqual(dup, orig) self.assertTrue(orig._is_text_encoding) + self.assertFalse(orig._is_single_byte) self.assertEqual(dup.encode, orig.encode) self.assertEqual(dup.name, orig.name) self.assertEqual(dup.incrementalencoder, orig.incrementalencoder) @@ -1940,6 +1942,7 @@ def test_pickle(self): unpickled_codec_info.incrementalencoder ) self.assertTrue(unpickled_codec_info._is_text_encoding) + self.assertFalse(unpickled_codec_info._is_single_byte) # Test a CodecInfo with _is_text_encoding equal to false. 
codec_info = codecs.lookup('base64') diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index aaa91aca36e3c4..0763bb19865167 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -227,7 +227,7 @@ def _verify_parse_output(self, operations): "Character data: '\xb5'", "End element: 'root'", ] - for operation, expected_operation in zip(operations, expected_operations): + for operation, expected_operation in zip(operations, expected_operations, strict=True): self.assertEqual(operation, expected_operation) def test_parse_bytes(self): @@ -276,6 +276,51 @@ def test_parse_again(self): self.assertEqual(expat.ErrorString(cm.exception.code), expat.errors.XML_ERROR_FINISHED) + @support.subTests('enc', ['UTF-8', 'utf-8', 'utf-16', 'koi8-u', + 'cp1125', 'cp1251', 'iso8859-5', + 'mac_cyrillic']) + def test_supportes_ecodings(self, enc): + out = self.Outputter() + parser = expat.ParserCreate() + self._hookup_callbacks(parser, out) + data = (f'\n' + '<корінь атрибут="значення">зміст').encode(enc) + parser.Parse(data, True) + self.assertEqual(out.out, [ + ('XML declaration', ('1.0', enc, -1)), + "Start element: 'корінь' {'атрибут': 'значення'}", + "Character data: 'зміст'", + "End element: 'корінь'", + ]) + + @support.subTests('enc', [ + 'UTF8', 'UTF-7', + "unicode-escape", "raw-unicode-escape", + "Big5-HKSCS", "Big5", + "cp932", "cp949", "cp950", + "EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR", + "GB18030", "GB2312", "GBK", + "HZ-GB-2312", + "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004", + "ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT", + "ISO-2022-KR", + "johab", + "Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213", + ]) + def test_unsupportes_ecodings(self, enc): + parser = expat.ParserCreate() + data = (f'\n' + '').encode(enc) + with self.assertRaises(ValueError): + parser.Parse(data, True) + + def test_unknown_ecoding(self): + parser = expat.ParserCreate() + data = b'\n' + with self.assertRaises(LookupError): + parser.Parse(data, True) 
+ + class NamespaceSeparatorTest(unittest.TestCase): def test_legal(self): # Tests that make sure we get errors when the namespace_separator value diff --git a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst new file mode 100644 index 00000000000000..5dd95047178938 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst @@ -0,0 +1,4 @@ +The :mod:`XML parser ` now raises :exc:`ValueError` for known +unsupported multi-byte encodings such us "UTF8", "ISO-2022-JP" or +"raw-unicode-escape" instead of failing later, when encounter non-ASCII +data. diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 0f0afe17513ef1..68c8ac0e4accef 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -4,6 +4,7 @@ #include "Python.h" #include "pycore_ceval.h" // _Py_EnterRecursiveCall() +#include "pycore_codecs.h" // _PyCodec_LookupTextEncoding() #include "pycore_import.h" // _PyImport_SetModule() #include "pycore_pyhash.h" // _Py_HashSecret #include "pycore_traceback.h" // _PyTraceback_Add() @@ -1465,6 +1466,31 @@ PyUnknownEncodingHandler(void *encodingHandlerData, if (PyErr_Occurred()) return XML_STATUS_ERROR; + PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL); + if (codec == NULL) { + return XML_STATUS_ERROR; + } + // if (!PyTuple_CheckExact(codec)) { + // PyObject *attr; + // if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) { + // Py_DECREF(codec); + // return XML_STATUS_ERROR; + // } + // if (attr != NULL) { + // int is_single_byte = PyObject_IsTrue(attr); + // Py_DECREF(attr); + // if (is_single_byte <= 0) { + // Py_DECREF(codec); + // if (is_single_byte == 0) { + // PyErr_SetString(PyExc_ValueError, + // "multi-byte encodings are not supported"); + // } + // return XML_STATUS_ERROR; + // } + // } + // } + Py_DECREF(codec); + u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace"); if (u == 
NULL) { Py_XDECREF(u); diff --git a/Tools/unicode/gencjkcodecs.py b/Tools/unicode/gencjkcodecs.py index 45866bf2f61062..eb04f67f2077eb 100644 --- a/Tools/unicode/gencjkcodecs.py +++ b/Tools/unicode/gencjkcodecs.py @@ -51,6 +51,7 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) """) From 2e2df1ea095bf9263b3aedb6332a5a2ef6c6ed3f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 23 Apr 2026 15:47:18 +0300 Subject: [PATCH 2/9] Uncomment temporary commented out code. --- Modules/pyexpat.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 68c8ac0e4accef..e95dcb611a33e2 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1470,25 +1470,25 @@ PyUnknownEncodingHandler(void *encodingHandlerData, if (codec == NULL) { return XML_STATUS_ERROR; } - // if (!PyTuple_CheckExact(codec)) { - // PyObject *attr; - // if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) { - // Py_DECREF(codec); - // return XML_STATUS_ERROR; - // } - // if (attr != NULL) { - // int is_single_byte = PyObject_IsTrue(attr); - // Py_DECREF(attr); - // if (is_single_byte <= 0) { - // Py_DECREF(codec); - // if (is_single_byte == 0) { - // PyErr_SetString(PyExc_ValueError, - // "multi-byte encodings are not supported"); - // } - // return XML_STATUS_ERROR; - // } - // } - // } + if (!PyTuple_CheckExact(codec)) { + PyObject *attr; + if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) { + Py_DECREF(codec); + return XML_STATUS_ERROR; + } + if (attr != NULL) { + int is_single_byte = PyObject_IsTrue(attr); + Py_DECREF(attr); + if (is_single_byte <= 0) { + Py_DECREF(codec); + if (is_single_byte == 0) { + PyErr_SetString(PyExc_ValueError, + "multi-byte encodings are not supported"); + } + return XML_STATUS_ERROR; + } + } + } Py_DECREF(codec); u = PyUnicode_Decode((const 
char*) template_buffer, 256, name, "replace"); From 91ac15e21f2d81fc2803856f641d7ae5bbaba45a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 15:59:16 +0300 Subject: [PATCH 3/9] Fix the module reference. --- .../next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst index 5dd95047178938..119a465fcb200a 100644 --- a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst +++ b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst @@ -1,4 +1,4 @@ -The :mod:`XML parser ` now raises :exc:`ValueError` for known +The :mod:`XML parser ` now raises :exc:`ValueError` for known unsupported multi-byte encodings such us "UTF8", "ISO-2022-JP" or "raw-unicode-escape" instead of failing later, when encounter non-ASCII data. From 2177825c7729d03c92b9618b0f98f2aca0abb3b9 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 16:11:37 +0300 Subject: [PATCH 4/9] Fix ElementTree tests. 
--- Lib/test/test_xml_etree.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 51af46f124cac6..730456e7582adc 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1014,7 +1014,7 @@ def xml(encoding): def bxml(encoding): return xml(encoding).encode(encoding) supported_encodings = [ - 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le', + 'ascii', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le', 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5', 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10', 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16', @@ -1025,32 +1025,34 @@ def bxml(encoding): 'cp1256', 'cp1257', 'cp1258', 'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2', 'mac-roman', 'mac-turkish', - 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004', - 'iso2022-jp-3', 'iso2022-jp-ext', - 'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', - 'hz', 'ptcp154', + 'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154', ] for encoding in supported_encodings: - self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'') + with self.subTest(encoding=encoding): + self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'') unsupported_ascii_compatible_encodings = [ 'big5', 'big5hkscs', 'cp932', 'cp949', 'cp950', 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr', 'gb2312', 'gbk', 'gb18030', - 'iso2022-kr', 'johab', + 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004', + 'iso2022-jp-3', 'iso2022-jp-ext', + 'iso2022-kr', 'johab', 'hz', 'shift-jis', 'shift-jis-2004', 'shift-jisx0213', - 'utf-7', + 'utf-7', 'utf-8-sig', 'utf8', ] for encoding in unsupported_ascii_compatible_encodings: - self.assertRaises(ValueError, ET.XML, bxml(encoding)) + with self.subTest(encoding=encoding): + self.assertRaises(ValueError, ET.XML, bxml(encoding)) unsupported_ascii_incompatible_encodings = [ 'cp037', 'cp424', 
'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140', 'utf_32', 'utf_32_be', 'utf_32_le', ] for encoding in unsupported_ascii_incompatible_encodings: - self.assertRaises(ET.ParseError, ET.XML, bxml(encoding)) + with self.subTest(encoding=encoding): + self.assertRaises(ET.ParseError, ET.XML, bxml(encoding)) self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii')) self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii')) From 6c9588fe822d5c38374b45f04b6060a61c324623 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 27 Apr 2026 18:47:31 +0300 Subject: [PATCH 5/9] Fix linking error for _PyCodec_LookupTextEncoding. --- Include/codecs.h | 6 ------ Python/codecs.c | 1 + 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Include/codecs.h b/Include/codecs.h index d14f527dee75da..512a3c723eca18 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -170,12 +170,6 @@ PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc); PyAPI_DATA(const char *) Py_hexdigits; #endif -#ifndef Py_LIMITED_API -PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding( - const char *encoding, - const char *alternate_command); -#endif - #ifdef __cplusplus } #endif diff --git a/Python/codecs.c b/Python/codecs.c index 0bde56c0ac662e..a522e6b88068b3 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -10,6 +10,7 @@ Copyright (c) Corporation for National Research Initiatives. #include "Python.h" #include "pycore_call.h" // _PyObject_CallNoArgs() +#include "pycore_codecs.h" // export _PyCodec_LookupTextEncoding() #include "pycore_interp.h" // PyInterpreterState.codec_search_path #include "pycore_pyerrors.h" // _PyErr_FormatNote() #include "pycore_pystate.h" // _PyInterpreterState_GET() From 02ecf828b36191f32dc466dd26ec948e472cf2b2 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 27 Apr 2026 19:08:18 +0300 Subject: [PATCH 6/9] Add more strict tests for supported encodings. 
--- Lib/test/test_pyexpat.py | 54 ++++++++++++++++++++++++++++++-------- Lib/test/test_xml_etree.py | 13 +++++---- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index 0763bb19865167..4d3425cf867227 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -276,24 +276,56 @@ def test_parse_again(self): self.assertEqual(expat.ErrorString(cm.exception.code), expat.errors.XML_ERROR_FINISHED) - @support.subTests('enc', ['UTF-8', 'utf-8', 'utf-16', 'koi8-u', - 'cp1125', 'cp1251', 'iso8859-5', - 'mac_cyrillic']) - def test_supportes_ecodings(self, enc): + @support.subTests('encoding', [ + 'utf-8', 'utf-16', 'utf-16be', 'utf-16le', + 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5', + 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10', + 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16', + 'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', + 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', + 'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1125', + 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', + 'cp1256', 'cp1257', 'cp1258', + 'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2', + 'mac-roman', 'mac-turkish', + 'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154', + ]) + def test_supported_ecodings(self, encoding): + out = self.Outputter() + parser = expat.ParserCreate() + self._hookup_callbacks(parser, out) + c = 'éπя\u05d0\u060c€'.encode(encoding, 'ignore').decode(encoding)[0] + data = (f'\n' + f'{c}').encode(encoding) + parser.Parse(data, True) + self.assertEqual(out.out, [ + ('XML declaration', ('1.0', encoding, -1)), + "Start element: 'root' {}", + f'Character data: {c!r}', + "End element: 'root'", + ]) + + @support.subTests('encoding', [ + 'UTF-8', 'utf-8', 'utf-16', 'utf-16le', 'utf-16be', + 'koi8-u', 'cp1125', 'cp1251', 'iso8859-5', 'mac-cyrillic', + ]) + def test_supported_ecodings2(self, encoding): out = 
self.Outputter() parser = expat.ParserCreate() self._hookup_callbacks(parser, out) - data = (f'\n' - '<корінь атрибут="значення">зміст').encode(enc) + data = (f'\n' + '' + '<корінь атрибут="значення">зміст').encode(encoding) parser.Parse(data, True) self.assertEqual(out.out, [ - ('XML declaration', ('1.0', enc, -1)), + ('XML declaration', ('1.0', encoding, -1)), + "Comment: ' коментар '", "Start element: 'корінь' {'атрибут': 'значення'}", "Character data: 'зміст'", "End element: 'корінь'", ]) - @support.subTests('enc', [ + @support.subTests('encoding', [ 'UTF8', 'UTF-7', "unicode-escape", "raw-unicode-escape", "Big5-HKSCS", "Big5", @@ -307,10 +339,10 @@ def test_supportes_ecodings(self, enc): "johab", "Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213", ]) - def test_unsupportes_ecodings(self, enc): + def test_unsupportes_ecodings(self, encoding): parser = expat.ParserCreate() - data = (f'\n' - '').encode(enc) + data = (f'\n' + '').encode(encoding) with self.assertRaises(ValueError): parser.Parse(data, True) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 730456e7582adc..71ebb7f3182b26 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1009,12 +1009,12 @@ def check(encoding, body=''): check("cp437", '\u221a') check("mac-roman", '\u02da') - def xml(encoding): - return "" % encoding - def bxml(encoding): - return xml(encoding).encode(encoding) + def xml(encoding, body=''): + return "%s" % (encoding, body) + def bxml(encoding, body=''): + return xml(encoding, body).encode(encoding) supported_encodings = [ - 'ascii', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le', + 'utf-8', 'utf-16', 'utf-16be', 'utf-16le', 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5', 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10', 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16', @@ -1030,6 +1030,9 @@ def bxml(encoding): for encoding in supported_encodings: with self.subTest(encoding=encoding): 
self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'') + c = 'éπя\u05d0\u060c€'.encode(encoding, 'ignore').decode(encoding)[0] + self.assertEqual(ET.tostring(ET.XML(bxml(encoding, c))), + ('&#%d;' % ord(c)).encode()) unsupported_ascii_compatible_encodings = [ 'big5', 'big5hkscs', From 230fde155bc8b7dec70f922839790c851b80d350 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 13 May 2026 13:46:36 +0300 Subject: [PATCH 7/9] Add a comment. --- Modules/pyexpat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 2fe07f3dda7065..09f9d40d4ec710 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1479,7 +1479,7 @@ PyUnknownEncodingHandler(void *encodingHandlerData, if (attr != NULL) { int is_single_byte = PyObject_IsTrue(attr); Py_DECREF(attr); - if (is_single_byte <= 0) { + if (is_single_byte <= 0) { // error or false Py_DECREF(codec); if (is_single_byte == 0) { PyErr_SetString(PyExc_ValueError, From ae909b268781c61061f2ce729f644ad1f53106d1 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 13 May 2026 16:31:59 +0300 Subject: [PATCH 8/9] Use _is_multibyte=True instead of _is_single_byte=False. 
--- Lib/codecs.py | 6 +++--- Lib/encodings/big5.py | 2 +- Lib/encodings/big5hkscs.py | 2 +- Lib/encodings/cp932.py | 2 +- Lib/encodings/cp949.py | 2 +- Lib/encodings/cp950.py | 2 +- Lib/encodings/euc_jis_2004.py | 2 +- Lib/encodings/euc_jisx0213.py | 2 +- Lib/encodings/euc_jp.py | 2 +- Lib/encodings/euc_kr.py | 2 +- Lib/encodings/gb18030.py | 2 +- Lib/encodings/gb2312.py | 2 +- Lib/encodings/gbk.py | 2 +- Lib/encodings/hz.py | 2 +- Lib/encodings/idna.py | 2 +- Lib/encodings/iso2022_jp.py | 2 +- Lib/encodings/iso2022_jp_1.py | 2 +- Lib/encodings/iso2022_jp_2.py | 2 +- Lib/encodings/iso2022_jp_2004.py | 2 +- Lib/encodings/iso2022_jp_3.py | 2 +- Lib/encodings/iso2022_jp_ext.py | 2 +- Lib/encodings/iso2022_kr.py | 2 +- Lib/encodings/johab.py | 2 +- Lib/encodings/punycode.py | 2 +- Lib/encodings/raw_unicode_escape.py | 2 +- Lib/encodings/shift_jis.py | 2 +- Lib/encodings/shift_jis_2004.py | 2 +- Lib/encodings/shift_jisx0213.py | 2 +- Lib/encodings/unicode_escape.py | 2 +- Lib/encodings/utf_16.py | 2 +- Lib/encodings/utf_16_be.py | 2 +- Lib/encodings/utf_16_le.py | 2 +- Lib/encodings/utf_32.py | 2 +- Lib/encodings/utf_32_be.py | 2 +- Lib/encodings/utf_32_le.py | 2 +- Lib/encodings/utf_7.py | 2 +- Lib/encodings/utf_8.py | 2 +- Lib/encodings/utf_8_sig.py | 2 +- Lib/test/test_codecs.py | 6 +++--- Modules/pyexpat.c | 8 ++++---- Tools/unicode/gencjkcodecs.py | 2 +- 41 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Lib/codecs.py b/Lib/codecs.py index e99460a670a516..411856b3738d61 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -94,7 +94,7 @@ class CodecInfo(tuple): def __new__(cls, encode, decode, streamreader=None, streamwriter=None, incrementalencoder=None, incrementaldecoder=None, name=None, *, _is_text_encoding=None, - _is_single_byte=None): + _is_multibyte=None): self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) self.name = name self.encode = encode @@ -105,8 +105,8 @@ def __new__(cls, encode, decode, streamreader=None, 
streamwriter=None, self.streamreader = streamreader if _is_text_encoding is not None: self._is_text_encoding = _is_text_encoding - if _is_single_byte is not None: - self._is_single_byte = _is_single_byte + if _is_multibyte is not None: + self._is_multibyte = _is_multibyte return self def __repr__(self): diff --git a/Lib/encodings/big5.py b/Lib/encodings/big5.py index 8bed14b35c5899..0ffbf78f8c5f4d 100644 --- a/Lib/encodings/big5.py +++ b/Lib/encodings/big5.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/big5hkscs.py b/Lib/encodings/big5hkscs.py index eeeb7865895190..c0c8960516469e 100644 --- a/Lib/encodings/big5hkscs.py +++ b/Lib/encodings/big5hkscs.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/cp932.py b/Lib/encodings/cp932.py index 3671a4387f96b6..08213e7d8682ea 100644 --- a/Lib/encodings/cp932.py +++ b/Lib/encodings/cp932.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/cp949.py b/Lib/encodings/cp949.py index df998ba3bad75c..4a0fb42579c4e6 100644 --- a/Lib/encodings/cp949.py +++ b/Lib/encodings/cp949.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/cp950.py b/Lib/encodings/cp950.py index 12c7bbd8d226ad..a1e0196668a619 100644 --- a/Lib/encodings/cp950.py +++ b/Lib/encodings/cp950.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - 
_is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/euc_jis_2004.py b/Lib/encodings/euc_jis_2004.py index 68604db3c30b2d..ede44475ae0891 100644 --- a/Lib/encodings/euc_jis_2004.py +++ b/Lib/encodings/euc_jis_2004.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/euc_jisx0213.py b/Lib/encodings/euc_jisx0213.py index cd2808965a6edd..958240852519ce 100644 --- a/Lib/encodings/euc_jisx0213.py +++ b/Lib/encodings/euc_jisx0213.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/euc_jp.py b/Lib/encodings/euc_jp.py index bcdd0582d71902..e1d4d25d6b417d 100644 --- a/Lib/encodings/euc_jp.py +++ b/Lib/encodings/euc_jp.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/euc_kr.py b/Lib/encodings/euc_kr.py index 8a81356d8f9980..28f491e7367d6a 100644 --- a/Lib/encodings/euc_kr.py +++ b/Lib/encodings/euc_kr.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/gb18030.py b/Lib/encodings/gb18030.py index 98df7d4cbeec3d..db8368747bad42 100644 --- a/Lib/encodings/gb18030.py +++ b/Lib/encodings/gb18030.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/gb2312.py b/Lib/encodings/gb2312.py index ba915a2500f21a..cb230c22b948a7 100644 --- a/Lib/encodings/gb2312.py +++ b/Lib/encodings/gb2312.py @@ 
-36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/gbk.py b/Lib/encodings/gbk.py index d597c7bb77e93e..7a6402036d39c1 100644 --- a/Lib/encodings/gbk.py +++ b/Lib/encodings/gbk.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/hz.py b/Lib/encodings/hz.py index 43ee36a9286426..5d175cc18d80cd 100644 --- a/Lib/encodings/hz.py +++ b/Lib/encodings/hz.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 98bf9462e36fbf..a7934dd9880dd1 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -385,5 +385,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/iso2022_jp.py b/Lib/encodings/iso2022_jp.py index 27129ce67aa884..ab2361562b1099 100644 --- a/Lib/encodings/iso2022_jp.py +++ b/Lib/encodings/iso2022_jp.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/iso2022_jp_1.py b/Lib/encodings/iso2022_jp_1.py index 0f41dd95cd4332..8066806b212e74 100644 --- a/Lib/encodings/iso2022_jp_1.py +++ b/Lib/encodings/iso2022_jp_1.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/iso2022_jp_2.py b/Lib/encodings/iso2022_jp_2.py index 
25f625819f5ea0..0804129a08b9db 100644 --- a/Lib/encodings/iso2022_jp_2.py +++ b/Lib/encodings/iso2022_jp_2.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/iso2022_jp_2004.py b/Lib/encodings/iso2022_jp_2004.py index 1f0bd1b7874472..292e3a7b95c0cc 100644 --- a/Lib/encodings/iso2022_jp_2004.py +++ b/Lib/encodings/iso2022_jp_2004.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/iso2022_jp_3.py b/Lib/encodings/iso2022_jp_3.py index 2acdb3a2cd9be3..036312d202374a 100644 --- a/Lib/encodings/iso2022_jp_3.py +++ b/Lib/encodings/iso2022_jp_3.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/iso2022_jp_ext.py b/Lib/encodings/iso2022_jp_ext.py index a32a533e8bdf00..e6a3f888f04516 100644 --- a/Lib/encodings/iso2022_jp_ext.py +++ b/Lib/encodings/iso2022_jp_ext.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/iso2022_kr.py b/Lib/encodings/iso2022_kr.py index 51dd4ab560422a..56a6e1d3115f1c 100644 --- a/Lib/encodings/iso2022_kr.py +++ b/Lib/encodings/iso2022_kr.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/johab.py b/Lib/encodings/johab.py index e58c50a06c4b96..a835154b552117 100644 --- a/Lib/encodings/johab.py +++ b/Lib/encodings/johab.py @@ -36,5 +36,5 @@ def getregentry(): 
incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index 335acb87cb9b28..d274d642d020cd 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -250,5 +250,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py index 5c5b41437a84b4..bb8bb15bd589be 100644 --- a/Lib/encodings/raw_unicode_escape.py +++ b/Lib/encodings/raw_unicode_escape.py @@ -43,5 +43,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/shift_jis.py b/Lib/encodings/shift_jis.py index bf7fded09468c8..ee2300fb4ad001 100644 --- a/Lib/encodings/shift_jis.py +++ b/Lib/encodings/shift_jis.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/shift_jis_2004.py b/Lib/encodings/shift_jis_2004.py index ae40b684a010f2..4d9c6fb8613cc7 100644 --- a/Lib/encodings/shift_jis_2004.py +++ b/Lib/encodings/shift_jis_2004.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/shift_jisx0213.py b/Lib/encodings/shift_jisx0213.py index 5af8565618b40e..2b80a1f7b2c102 100644 --- a/Lib/encodings/shift_jisx0213.py +++ b/Lib/encodings/shift_jisx0213.py @@ -36,5 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + 
_is_multibyte=True, ) diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py index d896cefc9596be..65b10462228554 100644 --- a/Lib/encodings/unicode_escape.py +++ b/Lib/encodings/unicode_escape.py @@ -43,5 +43,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py index eac93bd17d07d1..41c4f610532927 100644 --- a/Lib/encodings/utf_16.py +++ b/Lib/encodings/utf_16.py @@ -152,5 +152,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py index d056cf9202a40f..9dbb25ff018262 100644 --- a/Lib/encodings/utf_16_be.py +++ b/Lib/encodings/utf_16_be.py @@ -39,5 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py index 2e07f76cc3f742..f9655609379e02 100644 --- a/Lib/encodings/utf_16_le.py +++ b/Lib/encodings/utf_16_le.py @@ -39,5 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py index aebe145ec95e71..e5fd8175809be0 100644 --- a/Lib/encodings/utf_32.py +++ b/Lib/encodings/utf_32.py @@ -147,5 +147,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py index ee1b41a11aa35f..100a167a064473 100644 --- a/Lib/encodings/utf_32_be.py +++ b/Lib/encodings/utf_32_be.py @@ -34,5 
+34,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py index 4ac786bb73349b..1395c51dcfeac7 100644 --- a/Lib/encodings/utf_32_le.py +++ b/Lib/encodings/utf_32_le.py @@ -34,5 +34,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_7.py b/Lib/encodings/utf_7.py index 3127867fb5bff9..a273f0fa26c818 100644 --- a/Lib/encodings/utf_7.py +++ b/Lib/encodings/utf_7.py @@ -35,5 +35,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py index 3801615ce34001..d5544140451a95 100644 --- a/Lib/encodings/utf_8.py +++ b/Lib/encodings/utf_8.py @@ -39,5 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py index b5e5c89f80b9eb..fab8aaf7ba2abb 100644 --- a/Lib/encodings/utf_8_sig.py +++ b/Lib/encodings/utf_8_sig.py @@ -127,5 +127,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - _is_single_byte=False, + _is_multibyte=True, ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 03dd61a76db154..aada3752e318a0 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1892,7 +1892,7 @@ def test_copy(self): self.assertIsNot(dup, orig) self.assertEqual(dup, orig) self.assertTrue(orig._is_text_encoding) - self.assertFalse(orig._is_single_byte) + self.assertTrue(orig._is_multibyte) self.assertEqual(dup.encode, orig.encode) 
self.assertEqual(dup.name, orig.name) self.assertEqual(dup.incrementalencoder, orig.incrementalencoder) @@ -1913,7 +1913,7 @@ def test_deepcopy(self): self.assertIsNot(dup, orig) self.assertEqual(dup, orig) self.assertTrue(orig._is_text_encoding) - self.assertFalse(orig._is_single_byte) + self.assertTrue(orig._is_multibyte) self.assertEqual(dup.encode, orig.encode) self.assertEqual(dup.name, orig.name) self.assertEqual(dup.incrementalencoder, orig.incrementalencoder) @@ -1942,7 +1942,7 @@ def test_pickle(self): unpickled_codec_info.incrementalencoder ) self.assertTrue(unpickled_codec_info._is_text_encoding) - self.assertFalse(unpickled_codec_info._is_single_byte) + self.assertTrue(unpickled_codec_info._is_multibyte) # Test a CodecInfo with _is_text_encoding equal to false. codec_info = codecs.lookup('base64') diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 09f9d40d4ec710..81a71410c5de71 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1472,16 +1472,16 @@ PyUnknownEncodingHandler(void *encodingHandlerData, } if (!PyTuple_CheckExact(codec)) { PyObject *attr; - if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) { + if (PyObject_GetOptionalAttrString(codec, "_is_multibyte", &attr) < 0) { Py_DECREF(codec); return XML_STATUS_ERROR; } if (attr != NULL) { - int is_single_byte = PyObject_IsTrue(attr); + int is_multibyte = PyObject_IsTrue(attr); Py_DECREF(attr); - if (is_single_byte <= 0) { // error or false + if (is_multibyte != 0) { // true or error Py_DECREF(codec); - if (is_single_byte == 0) { + if (is_multibyte > 0) { // true PyErr_SetString(PyExc_ValueError, "multi-byte encodings are not supported"); } diff --git a/Tools/unicode/gencjkcodecs.py b/Tools/unicode/gencjkcodecs.py index eb04f67f2077eb..71d19693eb6f7b 100644 --- a/Tools/unicode/gencjkcodecs.py +++ b/Tools/unicode/gencjkcodecs.py @@ -51,7 +51,7 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, - 
_is_single_byte=False, + _is_multibyte=True, ) """) From fb266e193481c320079bd158410b386255de4467 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 13 May 2026 19:01:26 +0300 Subject: [PATCH 9/9] Include the encoding name in the error message. --- Modules/pyexpat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 81a71410c5de71..b688cf6a16fc37 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1482,8 +1482,9 @@ PyUnknownEncodingHandler(void *encodingHandlerData, if (is_multibyte != 0) { // true or error Py_DECREF(codec); if (is_multibyte > 0) { // true - PyErr_SetString(PyExc_ValueError, - "multi-byte encodings are not supported"); + PyErr_Format(PyExc_ValueError, + "multi-byte encodings are not supported: '%s'", + name); } return XML_STATUS_ERROR; } @@ -1499,8 +1500,9 @@ PyUnknownEncodingHandler(void *encodingHandlerData, if (PyUnicode_GET_LENGTH(u) != 256) { Py_DECREF(u); - PyErr_SetString(PyExc_ValueError, - "multi-byte encodings are not supported"); + PyErr_Format(PyExc_ValueError, + "multi-byte encodings are not supported: '%s'", + name); return XML_STATUS_ERROR; }