Skip to content

Commit 6616512

Browse files
serhiy-storchaka authored and mcepl committed
Fix use-after-free in the unicode-escape decoder with an error handler
Cut disused recode_encoding logic in _PyBytes_DecodeEscape.

All call sites pass NULL for `recode_encoding`, so this path is completely untested. That's been true since before Python 3.0. It adds significant complexity to this logic, so it's best to take it out.

All call sites now have a literal NULL, and that's been true since commit 768921c eliminated a conditional (`foo ? bar : NULL`) at the call site in Python/ast.c where we're parsing a bytes literal. But even before then, that condition `foo` had been a constant since unadorned string literals started meaning Unicode, in commit 572dbf8 aka v3.0a1~1035.

The `unicode` parameter is already unused, so mark it as unused too. The code that acted on it was also taken out before Python 3.0, in commit 8d30cc0 aka v3.0a1~1031.

The function (PyBytes_DecodeEscape) is exposed in the API, but it's never been documented.

Fixes: bsc#1243273 (CVE-2025-4516)
Fixes: gh#python#133767
From-PR: gh#python/cpython!134346
Patch: CVE-2025-4516-DecodeError-handler.patch
1 parent a438682 commit 6616512

File tree

13 files changed

+436
-149
lines changed

13 files changed

+436
-149
lines changed

Include/longobject.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
6666
#endif /* SIZEOF_VOID_P */
6767

6868
/* Used by Python/mystrtoul.c, _PyBytes_FromHex(),
69-
_PyBytes_DecodeEscapeRecode(), etc. */
69+
_PyBytes_DecodeEscape(), etc. */
7070
#ifndef Py_LIMITED_API
7171
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
7272
#endif

Include/modsupport.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ PyAPI_FUNC(PyObject *) Py_BuildValue(const char *, ...);
4141
PyAPI_FUNC(PyObject *) _Py_BuildValue_SizeT(const char *, ...);
4242

4343
#ifndef Py_LIMITED_API
44+
PyAPI_FUNC(int) _PyArg_UnpackStack(
45+
PyObject *const *args,
46+
Py_ssize_t nargs,
47+
const char *name,
48+
Py_ssize_t min,
49+
Py_ssize_t max,
50+
...);
51+
4452
PyAPI_FUNC(int) _PyArg_NoKeywords(const char *funcname, PyObject *kw);
4553
PyAPI_FUNC(int) _PyArg_NoPositional(const char *funcname, PyObject *args);
4654
#endif

Include/unicodeobject.h

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1505,12 +1505,33 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
15051505
);
15061506

15071507
#ifndef Py_LIMITED_API
1508+
/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
1509+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
1510+
const char *string, /* Unicode-Escape encoded string */
1511+
Py_ssize_t length, /* size of string */
1512+
const char *errors, /* error handling */
1513+
Py_ssize_t *consumed /* bytes consumed */
1514+
);
15081515
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
15091516
chars. */
1510-
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
1517+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
1518+
const char *string, /* Unicode-Escape encoded string */
1519+
Py_ssize_t length, /* size of string */
1520+
const char *errors, /* error handling */
1521+
Py_ssize_t *consumed, /* bytes consumed */
1522+
int *first_invalid_escape_char, /* on return, if not -1, contain the first
1523+
invalid escaped char (<= 0xff) or invalid
1524+
octal escape (> 0xff) in string. */
1525+
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
1526+
point to the first invalid escaped
1527+
char in string.
1528+
May be NULL if errors is not NULL. */
1529+
// Export for binary compatibility.
1530+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
15111531
const char *string, /* Unicode-Escape encoded string */
15121532
Py_ssize_t length, /* size of string */
15131533
const char *errors, /* error handling */
1534+
Py_ssize_t *consumed, /* bytes consumed */
15141535
const char **first_invalid_escape /* on return, points to first
15151536
invalid escaped char in
15161537
string. */
@@ -1547,6 +1568,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
15471568
);
15481569
#endif
15491570

1571+
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
1572+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
1573+
const char *string, /* Unicode-Escape encoded string */
1574+
Py_ssize_t length, /* size of string */
1575+
const char *errors, /* error handling */
1576+
Py_ssize_t *consumed /* bytes consumed */
1577+
);
1578+
15501579
/* --- Unicode Internal Codec ---------------------------------------------
15511580
15521581
Only for internal use in _codecsmodule.c */

Lib/encodings/unicode_escape.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
2121
def encode(self, input, final=False):
2222
return codecs.unicode_escape_encode(input, self.errors)[0]
2323

24-
class IncrementalDecoder(codecs.IncrementalDecoder):
25-
def decode(self, input, final=False):
26-
return codecs.unicode_escape_decode(input, self.errors)[0]
24+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
25+
def _buffer_decode(self, input, errors, final):
26+
return codecs.unicode_escape_decode(input, errors, final)
2727

2828
class StreamWriter(Codec,codecs.StreamWriter):
2929
pass
3030

3131
class StreamReader(Codec,codecs.StreamReader):
32-
pass
32+
def decode(self, input, errors='strict'):
33+
return codecs.unicode_escape_decode(input, errors, False)
3334

3435
### encodings module API
3536

Lib/test/test_codeccallbacks.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1004,7 +1004,7 @@ def test_bug828737(self):
10041004
text = 'abc<def>ghi'*n
10051005
text.translate(charmap)
10061006

1007-
def test_mutatingdecodehandler(self):
1007+
def test_mutating_decode_handler(self):
10081008
baddata = [
10091009
("ascii", b"\xff"),
10101010
("utf-7", b"++"),
@@ -1044,6 +1044,40 @@ def mutating(exc):
10441044
for (encoding, data) in baddata:
10451045
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
10461046

1047+
def test_mutating_decode_handler_unicode_escape(self):
1048+
decode = codecs.unicode_escape_decode
1049+
def mutating(exc):
1050+
if isinstance(exc, UnicodeDecodeError):
1051+
r = data.get(exc.object[:exc.end])
1052+
if r is not None:
1053+
exc.object = r[0] + exc.object[exc.end:]
1054+
return ('\u0404', r[1])
1055+
raise AssertionError("don't know how to handle %r" % exc)
1056+
1057+
codecs.register_error('test.mutating2', mutating)
1058+
data = {
1059+
br'\x0': (b'\\', 0),
1060+
br'\x3': (b'xxx\\', 3),
1061+
br'\x5': (b'x\\', 1),
1062+
}
1063+
def check(input, expected, msg):
1064+
with self.assertWarns(DeprecationWarning) as cm:
1065+
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1066+
self.assertIn(msg, str(cm.warning))
1067+
1068+
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1069+
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1070+
1071+
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1072+
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1073+
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1074+
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1075+
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1076+
1077+
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1078+
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1079+
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1080+
10471081
# issue32583
10481082
def test_crashing_decode_handler(self):
10491083
# better generating one more character to fill the extra space slot

Lib/test/test_codecs.py

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,20 +1206,32 @@ def test_escape(self):
12061206
check(br"[\501]", b"[A]")
12071207
check(br"[\x41]", b"[A]")
12081208
check(br"[\x410]", b"[A0]")
1209+
1210+
def test_warnings(self):
1211+
decode = codecs.escape_decode
1212+
check = coding_checker(self, decode)
12091213
for i in range(97, 123):
12101214
b = bytes([i])
12111215
if b not in b'abfnrtvx':
1212-
with self.assertWarns(DeprecationWarning):
1216+
with self.assertWarnsRegex(DeprecationWarning,
1217+
r"invalid escape sequence '\\%c'" % i):
12131218
check(b"\\" + b, b"\\" + b)
1214-
with self.assertWarns(DeprecationWarning):
1219+
with self.assertWarnsRegex(DeprecationWarning,
1220+
r"invalid escape sequence '\\%c'" % (i-32)):
12151221
check(b"\\" + b.upper(), b"\\" + b.upper())
1216-
with self.assertWarns(DeprecationWarning):
1222+
with self.assertWarnsRegex(DeprecationWarning,
1223+
r"invalid escape sequence '\\8'"):
12171224
check(br"\8", b"\\8")
12181225
with self.assertWarns(DeprecationWarning):
12191226
check(br"\9", b"\\9")
1220-
with self.assertWarns(DeprecationWarning):
1227+
with self.assertWarnsRegex(DeprecationWarning,
1228+
r"invalid escape sequence '\\\xfa'") as cm:
12211229
check(b"\\\xfa", b"\\\xfa")
12221230

1231+
with self.assertWarnsRegex(DeprecationWarning,
1232+
r"invalid escape sequence '\\z'"):
1233+
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
1234+
12231235
def test_errors(self):
12241236
decode = codecs.escape_decode
12251237
self.assertRaises(ValueError, decode, br"\x")
@@ -2428,7 +2440,11 @@ def test_unicode_escape(self):
24282440
(r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
24292441

24302442

2431-
class UnicodeEscapeTest(unittest.TestCase):
2443+
class UnicodeEscapeTest(ReadTest, unittest.TestCase):
2444+
encoding = "unicode-escape"
2445+
2446+
test_lone_surrogates = None
2447+
24322448
def test_empty(self):
24332449
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
24342450
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@@ -2484,20 +2500,31 @@ def test_escape_decode(self):
24842500
check(br"[\x410]", "[A0]")
24852501
check(br"\u20ac", "\u20ac")
24862502
check(br"\U0001d120", "\U0001d120")
2503+
2504+
def test_decode_warnings(self):
2505+
decode = codecs.unicode_escape_decode
2506+
check = coding_checker(self, decode)
24872507
for i in range(97, 123):
24882508
b = bytes([i])
24892509
if b not in b'abfnrtuvx':
2490-
with self.assertWarns(DeprecationWarning):
2510+
with self.assertWarnsRegex(DeprecationWarning,
2511+
r"invalid escape sequence '\\%c'" % i):
24912512
check(b"\\" + b, "\\" + chr(i))
24922513
if b.upper() not in b'UN':
2493-
with self.assertWarns(DeprecationWarning):
2514+
with self.assertWarnsRegex(DeprecationWarning,
2515+
r"invalid escape sequence '\\%c'" % (i-32)):
24942516
check(b"\\" + b.upper(), "\\" + chr(i-32))
2495-
with self.assertWarns(DeprecationWarning):
2517+
with self.assertWarnsRegex(DeprecationWarning,
2518+
r"invalid escape sequence '\\8'"):
24962519
check(br"\8", "\\8")
24972520
with self.assertWarns(DeprecationWarning):
24982521
check(br"\9", "\\9")
2499-
with self.assertWarns(DeprecationWarning):
2522+
with self.assertWarnsRegex(DeprecationWarning,
2523+
r"invalid escape sequence '\\\xfa'") as cm:
25002524
check(b"\\\xfa", "\\\xfa")
2525+
with self.assertWarnsRegex(DeprecationWarning,
2526+
r"invalid escape sequence '\\z'"):
2527+
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
25012528

25022529
def test_decode_errors(self):
25032530
decode = codecs.unicode_escape_decode
@@ -2515,6 +2542,44 @@ def test_decode_errors(self):
25152542
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
25162543
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
25172544

2545+
def test_partial(self):
2546+
self.check_partial(
2547+
"\x00\t\n\r\\\xff\uffff\U00010000",
2548+
[
2549+
'',
2550+
'',
2551+
'',
2552+
'\x00',
2553+
'\x00',
2554+
'\x00\t',
2555+
'\x00\t',
2556+
'\x00\t\n',
2557+
'\x00\t\n',
2558+
'\x00\t\n\r',
2559+
'\x00\t\n\r',
2560+
'\x00\t\n\r\\',
2561+
'\x00\t\n\r\\',
2562+
'\x00\t\n\r\\',
2563+
'\x00\t\n\r\\',
2564+
'\x00\t\n\r\\\xff',
2565+
'\x00\t\n\r\\\xff',
2566+
'\x00\t\n\r\\\xff',
2567+
'\x00\t\n\r\\\xff',
2568+
'\x00\t\n\r\\\xff',
2569+
'\x00\t\n\r\\\xff',
2570+
'\x00\t\n\r\\\xff\uffff',
2571+
'\x00\t\n\r\\\xff\uffff',
2572+
'\x00\t\n\r\\\xff\uffff',
2573+
'\x00\t\n\r\\\xff\uffff',
2574+
'\x00\t\n\r\\\xff\uffff',
2575+
'\x00\t\n\r\\\xff\uffff',
2576+
'\x00\t\n\r\\\xff\uffff',
2577+
'\x00\t\n\r\\\xff\uffff',
2578+
'\x00\t\n\r\\\xff\uffff',
2579+
'\x00\t\n\r\\\xff\uffff',
2580+
'\x00\t\n\r\\\xff\uffff\U00010000',
2581+
]
2582+
)
25182583

25192584
class RawUnicodeEscapeTest(unittest.TestCase):
25202585
def test_empty(self):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix incremental decoder and stream reader in the "unicode-escape" codec.
2+
Previously they failed if the escape sequence was split.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
2+
handler.

Modules/_codecsmodule.c

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -519,35 +519,41 @@ _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
519519
/*[clinic input]
520520
_codecs.unicode_escape_decode
521521
data: Py_buffer(accept={str, buffer})
522-
errors: str(accept={str, NoneType}) = NULL
522+
errors: str(accept={str, NoneType}) = None
523+
final: bool = True
523524
/
524525
[clinic start generated code]*/
525526

526527
static PyObject *
527528
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
528-
const char *errors)
529-
/*[clinic end generated code: output=3ca3c917176b82ab input=49fd27d06813a7f5]*/
529+
const char *errors, int final)
530+
/*[clinic end generated code: output=b284f97b12c635ee input=15019f081ffe272b]*/
530531
{
531-
PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
532-
errors);
533-
return codec_tuple(decoded, data->len);
532+
Py_ssize_t consumed = data->len;
533+
PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
534+
errors,
535+
final ? NULL : &consumed);
536+
return codec_tuple(decoded, consumed);
534537
}
535538

536539
/*[clinic input]
537540
_codecs.raw_unicode_escape_decode
538541
data: Py_buffer(accept={str, buffer})
539-
errors: str(accept={str, NoneType}) = NULL
542+
errors: str(accept={str, NoneType}) = None
543+
final: bool = True
540544
/
541545
[clinic start generated code]*/
542546

543547
static PyObject *
544548
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
545-
const char *errors)
546-
/*[clinic end generated code: output=c98eeb56028070a6 input=770903a211434ebc]*/
549+
const char *errors, int final)
550+
/*[clinic end generated code: output=11dbd96301e2879e input=b93f823aa8c343ad]*/
547551
{
548-
PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
549-
errors);
550-
return codec_tuple(decoded, data->len);
552+
Py_ssize_t consumed = data->len;
553+
PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
554+
errors,
555+
final ? NULL : &consumed);
556+
return codec_tuple(decoded, consumed);
551557
}
552558

553559
/*[clinic input]

0 commit comments

Comments
 (0)