Skip to content

Commit c8b3a94

Browse files
ch4n3-yoon authored and sethmlarson committed
gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize()
Replace the insertion sort used for canonical ordering of combining characters with a hybrid approach: insertion sort for short runs (< 20) and counting sort for longer runs, reducing worst-case complexity from O(n^2) to O(n). This prevents denial of service via crafted Unicode strings with many combining characters in alternating CCC order. Co-authored-by: Seokchan Yoon <13852925+ch4n3-yoon@users.noreply.github.com>
1 parent 005555a commit c8b3a94

3 files changed

Lines changed: 159 additions & 27 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,34 @@ def test_issue10254(self):
616616
b = 'C\u0338' * 20 + '\xC7'
617617
self.assertEqual(self.db.normalize('NFC', a), b)
618618

619+
def test_long_combining_mark_run(self):
620+
# GH-XXXXX: avoid quadratic canonical ordering.
621+
payload = "a" + ("\u0300\u0327" * 32)
622+
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
623+
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)
624+
625+
self.assertEqual(self.db.normalize("NFD", payload), nfd)
626+
self.assertEqual(self.db.normalize("NFKD", payload), nfd)
627+
self.assertEqual(self.db.normalize("NFC", payload), nfc)
628+
self.assertEqual(self.db.normalize("NFKC", payload), nfc)
629+
630+
def test_combining_mark_run_fast_paths(self):
631+
# GH-XXXXX: cover short runs and already-sorted long runs.
632+
short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
633+
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
634+
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
635+
long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)
636+
long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)
637+
638+
self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
639+
self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
640+
self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
641+
self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
642+
self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
643+
self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
644+
self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
645+
self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)
646+
619647
def test_issue29456(self):
620648
# Fix #29456
621649
u1176_str_a = '\u1100\u1176\u11a8'
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix a potential denial of service in :func:`unicodedata.normalize`. The
2+
canonical ordering step of Unicode normalization used an O(n²) insertion
3+
sort for reordering combining characters, which could be exploited with
4+
crafted input containing many combining characters in non-canonical order.
5+
Long runs are now sorted with a linear-time counting sort; short runs still use insertion sort.

Modules/unicodedata.c

Lines changed: 126 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -552,19 +552,88 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
552552
(*index)++;
553553
}
554554

555+
/* Small combining runs are usually cheaper with insertion sort. */
556+
#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20
557+
558+
static void
559+
canonical_ordering_sort_insertion(int kind, void *data,
560+
Py_ssize_t start, Py_ssize_t end)
561+
{
562+
for (Py_ssize_t i = start + 1; i < end; i++) {
563+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
564+
unsigned char combining = _getrecord_ex(code)->combining;
565+
Py_ssize_t j = i;
566+
567+
while (j > start) {
568+
Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1);
569+
if (_getrecord_ex(previous)->combining <= combining) {
570+
break;
571+
}
572+
PyUnicode_WRITE(kind, data, j, previous);
573+
j--;
574+
}
575+
if (j != i) {
576+
PyUnicode_WRITE(kind, data, j, code);
577+
}
578+
}
579+
}
580+
581+
static void
582+
canonical_ordering_sort_counting(int kind, void *data,
583+
Py_ssize_t start, Py_ssize_t end,
584+
Py_UCS4 *sortbuf)
585+
{
586+
Py_ssize_t counts[256] = {0};
587+
Py_ssize_t run_length = end - start;
588+
Py_ssize_t total = 0;
589+
unsigned char min_combining = 255;
590+
unsigned char max_combining = 0;
591+
592+
for (Py_ssize_t i = start; i < end; i++) {
593+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
594+
unsigned char combining = _getrecord_ex(code)->combining;
595+
counts[combining]++;
596+
if (combining < min_combining) {
597+
min_combining = combining;
598+
}
599+
if (combining > max_combining) {
600+
max_combining = combining;
601+
}
602+
}
603+
604+
for (Py_ssize_t i = min_combining; i <= max_combining; i++) {
605+
Py_ssize_t count = counts[i];
606+
counts[i] = total;
607+
total += count;
608+
}
609+
610+
/* Reuse counts[] as the next output slot for each CCC. */
611+
for (Py_ssize_t i = start; i < end; i++) {
612+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
613+
unsigned char combining = _getrecord_ex(code)->combining;
614+
sortbuf[counts[combining]++] = code;
615+
}
616+
for (Py_ssize_t i = 0; i < run_length; i++) {
617+
PyUnicode_WRITE(kind, data, start + i, sortbuf[i]);
618+
}
619+
}
620+
555621
static PyObject*
556622
nfd_nfkd(PyObject *self, PyObject *input, int k)
557623
{
558624
PyObject *result;
559625
Py_UCS4 *output;
560626
Py_ssize_t i, o, osize;
561-
int kind;
562-
const void *data;
627+
int input_kind, result_kind;
628+
const void *input_data;
629+
void *result_data;
563630
/* Longest decomposition in Unicode 3.2: U+FDFA */
564631
Py_UCS4 stack[20];
565-
Py_ssize_t space, isize;
632+
Py_ssize_t space, isize, length;
566633
int index, prefix, count, stackptr;
567634
unsigned char prev, cur;
635+
Py_UCS4 *sortbuf = NULL;
636+
Py_ssize_t sortbuflen = 0;
568637

569638
stackptr = 0;
570639
isize = PyUnicode_GET_LENGTH(input);
@@ -584,11 +653,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
584653
return NULL;
585654
}
586655
i = o = 0;
587-
kind = PyUnicode_KIND(input);
588-
data = PyUnicode_DATA(input);
656+
input_kind = PyUnicode_KIND(input);
657+
input_data = PyUnicode_DATA(input);
589658

590659
while (i < isize) {
591-
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
660+
stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
592661
while(stackptr) {
593662
Py_UCS4 code = stack[--stackptr];
594663
/* Hangul Decomposition adds three characters in
@@ -656,34 +725,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
656725
if (!result)
657726
return NULL;
658727

659-
kind = PyUnicode_KIND(result);
660-
data = PyUnicode_DATA(result);
728+
result_kind = PyUnicode_KIND(result);
729+
result_data = PyUnicode_DATA(result);
730+
length = PyUnicode_GET_LENGTH(result);
661731

662-
/* Sort canonically. */
732+
/* Sort each consecutive combining-character run canonically. */
663733
i = 0;
664-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
665-
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
666-
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
667-
if (prev == 0 || cur == 0 || prev <= cur) {
668-
prev = cur;
734+
while (i < length) {
735+
Py_ssize_t run_length, run_start;
736+
int needs_sort = 0;
737+
738+
prev = _getrecord_ex(
739+
PyUnicode_READ(result_kind, result_data, i))->combining;
740+
if (prev == 0) {
741+
i++;
669742
continue;
670743
}
671-
/* Non-canonical order. Need to switch *i with previous. */
672-
o = i - 1;
673-
while (1) {
674-
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
675-
PyUnicode_WRITE(kind, data, o+1,
676-
PyUnicode_READ(kind, data, o));
677-
PyUnicode_WRITE(kind, data, o, tmp);
678-
o--;
679-
if (o < 0)
680-
break;
681-
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
682-
if (prev == 0 || prev <= cur)
744+
745+
run_start = i++;
746+
while (i < length) {
747+
cur = _getrecord_ex(
748+
PyUnicode_READ(result_kind, result_data, i))->combining;
749+
if (cur == 0) {
683750
break;
751+
}
752+
if (prev > cur) {
753+
needs_sort = 1;
754+
}
755+
prev = cur;
756+
i++;
757+
}
758+
if (!needs_sort) {
759+
continue;
760+
}
761+
762+
run_length = i - run_start;
763+
if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
764+
canonical_ordering_sort_insertion(result_kind, result_data,
765+
run_start, i);
766+
continue;
684767
}
685-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
768+
769+
if (run_length > sortbuflen) {
770+
Py_UCS4 *new_sortbuf = PyMem_Realloc(sortbuf,
771+
run_length * sizeof(Py_UCS4));
772+
if (new_sortbuf == NULL) {
773+
PyErr_NoMemory();
774+
PyMem_Free(sortbuf);
775+
Py_DECREF(result);
776+
return NULL;
777+
}
778+
sortbuf = new_sortbuf;
779+
sortbuflen = run_length;
780+
}
781+
782+
canonical_ordering_sort_counting(result_kind, result_data,
783+
run_start, i, sortbuf);
686784
}
785+
PyMem_Free(sortbuf);
687786
return result;
688787
}
689788

0 commit comments

Comments
 (0)