Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Include/internal/pycore_codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
in Python 3.5+?
*/
extern PyObject* _PyCodec_LookupTextEncoding(
PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
const char *encoding,
const char *alternate_command);

Expand Down
5 changes: 4 additions & 1 deletion Lib/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ class CodecInfo(tuple):

def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
incrementalencoder=None, incrementaldecoder=None, name=None,
*, _is_text_encoding=None):
*, _is_text_encoding=None,
_is_multibyte=None):
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
self.name = name
self.encode = encode
Expand All @@ -104,6 +105,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
self.streamreader = streamreader
if _is_text_encoding is not None:
self._is_text_encoding = _is_text_encoding
if _is_multibyte is not None:
self._is_multibyte = _is_multibyte
return self

def __repr__(self):
Expand Down
1 change: 1 addition & 0 deletions Lib/encodings/big5.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/big5hkscs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/cp932.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/cp949.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/cp950.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_jis_2004.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_jisx0213.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_jp.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_kr.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/gb18030.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/gb2312.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/gbk.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/hz.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/idna.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,4 +385,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_2004.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_kr.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/johab.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/punycode.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,4 +250,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/raw_unicode_escape.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/shift_jis.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/shift_jis_2004.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/shift_jisx0213.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/unicode_escape.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_16.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_16_be.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_16_le.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_32.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_32_be.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_32_le.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_7.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_8.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_8_sig.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
3 changes: 3 additions & 0 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1892,6 +1892,7 @@ def test_copy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertTrue(orig._is_text_encoding)
self.assertTrue(orig._is_multibyte)
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
Expand All @@ -1912,6 +1913,7 @@ def test_deepcopy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertTrue(orig._is_text_encoding)
self.assertTrue(orig._is_multibyte)
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
Expand Down Expand Up @@ -1940,6 +1942,7 @@ def test_pickle(self):
unpickled_codec_info.incrementalencoder
)
self.assertTrue(unpickled_codec_info._is_text_encoding)
self.assertTrue(unpickled_codec_info._is_multibyte)

# Test a CodecInfo with _is_text_encoding equal to false.
codec_info = codecs.lookup('base64')
Expand Down
6 changes: 5 additions & 1 deletion Lib/test/test_pyexpat.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,11 +325,15 @@ def test_supported_ecodings2(self, encoding):
])

@support.subTests('encoding', [
'UTF-7',
'UTF8', 'UTF-7',
"unicode-escape", "raw-unicode-escape",
"Big5-HKSCS", "Big5",
"cp932", "cp949", "cp950",
"EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR",
"GB18030", "GB2312", "GBK",
"HZ-GB-2312",
"ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004",
"ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT",
"ISO-2022-KR",
"johab",
"Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213",
Expand Down
6 changes: 4 additions & 2 deletions Lib/test/test_xml_etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,9 +1039,11 @@ def bxml(encoding, body=''):
'cp932', 'cp949', 'cp950',
'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
'gb2312', 'gbk', 'gb18030',
'iso2022-kr', 'johab',
'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
'iso2022-jp-3', 'iso2022-jp-ext',
'iso2022-kr', 'johab', 'hz',
'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
'utf-7',
'utf-7', 'utf-8-sig', 'utf8',
]
for encoding in unsupported_ascii_compatible_encodings:
with self.subTest(encoding=encoding):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
The :mod:`XML parser <xml.parsers.expat>` now raises :exc:`ValueError` for known
unsupported multi-byte encodings such as "UTF8", "ISO-2022-JP" or
"raw-unicode-escape" instead of failing later, when it encounters non-ASCII
data.
32 changes: 30 additions & 2 deletions Modules/pyexpat.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "Python.h"
#include "pycore_ceval.h" // _Py_EnterRecursiveCall()
#include "pycore_codecs.h" // _PyCodec_LookupTextEncoding()
#include "pycore_import.h" // _PyImport_SetModule()
#include "pycore_pyhash.h" // _Py_HashSecret
#include "pycore_traceback.h" // _PyTraceback_Add()
Expand Down Expand Up @@ -1465,6 +1466,32 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
if (PyErr_Occurred())
return XML_STATUS_ERROR;

PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL);
if (codec == NULL) {
return XML_STATUS_ERROR;
}
if (!PyTuple_CheckExact(codec)) {
PyObject *attr;
if (PyObject_GetOptionalAttrString(codec, "_is_multibyte", &attr) < 0) {
Py_DECREF(codec);
return XML_STATUS_ERROR;
}
if (attr != NULL) {
int is_multibyte = PyObject_IsTrue(attr);
Py_DECREF(attr);
if (is_multibyte != 0) { // true or error
Py_DECREF(codec);
if (is_multibyte > 0) { // true
PyErr_Format(PyExc_ValueError,
"multi-byte encodings are not supported: '%s'",
name);
}
return XML_STATUS_ERROR;
}
}
}
Py_DECREF(codec);

u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace");
if (u == NULL) {
Py_XDECREF(u);
Expand All @@ -1473,8 +1500,9 @@ PyUnknownEncodingHandler(void *encodingHandlerData,

if (PyUnicode_GET_LENGTH(u) != 256) {
Py_DECREF(u);
PyErr_SetString(PyExc_ValueError,
"multi-byte encodings are not supported");
PyErr_Format(PyExc_ValueError,
"multi-byte encodings are not supported: '%s'",
name);
return XML_STATUS_ERROR;
}

Expand Down
1 change: 1 addition & 0 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Copyright (c) Corporation for National Research Initiatives.

#include "Python.h"
#include "pycore_call.h" // _PyObject_CallNoArgs()
#include "pycore_codecs.h" // export _PyCodec_LookupTextEncoding()
#include "pycore_interp.h" // PyInterpreterState.codec_search_path
#include "pycore_pyerrors.h" // _PyErr_FormatNote()
#include "pycore_pystate.h" // _PyInterpreterState_GET()
Expand Down
1 change: 1 addition & 0 deletions Tools/unicode/gencjkcodecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_multibyte=True,
)
""")

Expand Down
Loading