Fix use-after-free in the unicode-escape decoder with an error handler

serhiy-storchaka · mcepl · commit 66165124d1f0 · 2025-06-11T19:50:51.000+02:00
Cut disused recode_encoding logic in _PyBytes_DecodeEscape.

All call sites pass NULL for `recode_encoding`, so this path is completely untested. That's been true since before Python 3.0. It adds significant complexity to this logic, so it's best to take it out.

All call sites now have a literal NULL, and that's been true since commit 768921c eliminated a conditional (`foo ? bar : NULL`) at the call site in Python/ast.c where we're parsing a bytes literal. But even before then, that condition `foo` had been a constant since unadorned string literals started meaning Unicode, in commit 572dbf8 aka v3.0a1~1035.

The `unicode` parameter is already unused, so mark it as unused too. The code that acted on it was also taken out before Python 3.0, in commit 8d30cc0 aka v3.0a1~1031.

The function (PyBytes_DecodeEscape) is exposed in the API, but it's never been documented.

Fixes: bsc#1243273 (CVE-2025-4516)
Fixes: gh#python#133767
From-PR: gh#python/cpython!134346
Patch: CVE-2025-4516-DecodeError-handler.patch
diff --git a/Include/longobject.h b/Include/longobject.h
@@ -66,7 +66,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
 #endif /* SIZEOF_VOID_P */
 
 /* Used by Python/mystrtoul.c, _PyBytes_FromHex(),
-   _PyBytes_DecodeEscapeRecode(), etc. */
+   _PyBytes_DecodeEscape(), etc. */
 #ifndef Py_LIMITED_API
 PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
 #endif
diff --git a/Include/modsupport.h b/Include/modsupport.h
@@ -41,6 +41,14 @@ PyAPI_FUNC(PyObject *) Py_BuildValue(const char *, ...);
 PyAPI_FUNC(PyObject *) _Py_BuildValue_SizeT(const char *, ...);
 
 #ifndef Py_LIMITED_API
+PyAPI_FUNC(int) _PyArg_UnpackStack(
+    PyObject *const *args,
+    Py_ssize_t nargs,
+    const char *name,
+    Py_ssize_t min,
+    Py_ssize_t max,
+    ...);
+
 PyAPI_FUNC(int) _PyArg_NoKeywords(const char *funcname, PyObject *kw);
 PyAPI_FUNC(int) _PyArg_NoPositional(const char *funcname, PyObject *args);
 #endif
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -1505,12 +1505,33 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
     );
 
 #ifndef Py_LIMITED_API
+/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
+        const char *string,     /* Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        Py_ssize_t *consumed    /* bytes consumed */
+);
 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
    chars. */
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
+    const char *string,     /* Unicode-Escape encoded string */
+    Py_ssize_t length,      /* size of string */
+    const char *errors,     /* error handling */
+    Py_ssize_t *consumed,   /* bytes consumed */
+    int *first_invalid_escape_char, /* on return, if not -1, contain the first
+                                       invalid escaped char (<= 0xff) or invalid
+                                       octal escape (> 0xff) in string. */
+    const char **first_invalid_escape_ptr); /* on return, if not NULL, may
+                                        point to the first invalid escaped
+                                        char in string.
+                                        May be NULL if errors is not NULL. */
+// Export for binary compatibility.
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
         const char *string,     /* Unicode-Escape encoded string */
         Py_ssize_t length,      /* size of string */
         const char *errors,     /* error handling */
+        Py_ssize_t *consumed,   /* bytes consumed */
         const char **first_invalid_escape  /* on return, points to first
                                               invalid escaped char in
                                               string. */
@@ -1547,6 +1568,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
     );
 #endif
 
+/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
+        const char *string,     /* Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        Py_ssize_t *consumed    /* bytes consumed */
+);
+
 /* --- Unicode Internal Codec ---------------------------------------------
 
     Only for internal use in _codecsmodule.c */
diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
     def encode(self, input, final=False):
         return codecs.unicode_escape_encode(input, self.errors)[0]
 
-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.unicode_escape_decode(input, self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.unicode_escape_decode(input, errors, final)
 
 class StreamWriter(Codec,codecs.StreamWriter):
     pass
 
 class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def decode(self, input, errors='strict'):
+        return codecs.unicode_escape_decode(input, errors, False)
 
 ### encodings module API
 
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
@@ -1004,7 +1004,7 @@ def test_bug828737(self):
             text = 'abc<def>ghi'*n
             text.translate(charmap)
 
-    def test_mutatingdecodehandler(self):
+    def test_mutating_decode_handler(self):
         baddata = [
             ("ascii", b"\xff"),
             ("utf-7", b"++"),
@@ -1044,6 +1044,40 @@ def mutating(exc):
             for (encoding, data) in baddata:
                 self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    def test_mutating_decode_handler_unicode_escape(self):
+        decode = codecs.unicode_escape_decode
+        def mutating(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                r = data.get(exc.object[:exc.end])
+                if r is not None:
+                    exc.object = r[0] + exc.object[exc.end:]
+                    return ('\u0404', r[1])
+            raise AssertionError("don't know how to handle %r" % exc)
+
+        codecs.register_error('test.mutating2', mutating)
+        data = {
+            br'\x0': (b'\\', 0),
+            br'\x3': (b'xxx\\', 3),
+            br'\x5': (b'x\\', 1),
+        }
+        def check(input, expected, msg):
+            with self.assertWarns(DeprecationWarning) as cm:
+                self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
+            self.assertIn(msg, str(cm.warning))
+
+        check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
+        check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
+
+        check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
+        check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
+        check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
+        check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
+        check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
+
+        check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
+        check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
+        check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
+
     # issue32583
     def test_crashing_decode_handler(self):
         # better generating one more character to fill the extra space slot
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
@@ -1206,20 +1206,32 @@ def test_escape(self):
         check(br"[\501]", b"[A]")
         check(br"[\x41]", b"[A]")
         check(br"[\x410]", b"[A0]")
+
+    def test_warnings(self):
+        decode = codecs.escape_decode
+        check = coding_checker(self, decode)
         for i in range(97, 123):
             b = bytes([i])
             if b not in b'abfnrtvx':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
+                        r"invalid escape sequence '\\%c'" % i):
                     check(b"\\" + b, b"\\" + b)
-            with self.assertWarns(DeprecationWarning):
+            with self.assertWarnsRegex(DeprecationWarning,
+                    r"invalid escape sequence '\\%c'" % (i-32)):
                 check(b"\\" + b.upper(), b"\\" + b.upper())
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r"invalid escape sequence '\\8'"):
             check(br"\8", b"\\8")
         with self.assertWarns(DeprecationWarning):
             check(br"\9", b"\\9")
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r"invalid escape sequence '\\\xfa'") as cm:
             check(b"\\\xfa", b"\\\xfa")
 
+        with self.assertWarnsRegex(DeprecationWarning,
+                r"invalid escape sequence '\\z'"):
+            self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
+
     def test_errors(self):
         decode = codecs.escape_decode
         self.assertRaises(ValueError, decode, br"\x")
@@ -2428,7 +2440,11 @@ def test_unicode_escape(self):
                          (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
 
 
-class UnicodeEscapeTest(unittest.TestCase):
+class UnicodeEscapeTest(ReadTest, unittest.TestCase):
+    encoding = "unicode-escape"
+
+    test_lone_surrogates = None
+
     def test_empty(self):
         self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
         self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@@ -2484,20 +2500,31 @@ def test_escape_decode(self):
         check(br"[\x410]", "[A0]")
         check(br"\u20ac", "\u20ac")
         check(br"\U0001d120", "\U0001d120")
+
+    def test_decode_warnings(self):
+        decode = codecs.unicode_escape_decode
+        check = coding_checker(self, decode)
         for i in range(97, 123):
             b = bytes([i])
             if b not in b'abfnrtuvx':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
+                        r"invalid escape sequence '\\%c'" % i):
                     check(b"\\" + b, "\\" + chr(i))
             if b.upper() not in b'UN':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
+                        r"invalid escape sequence '\\%c'" % (i-32)):
                     check(b"\\" + b.upper(), "\\" + chr(i-32))
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r"invalid escape sequence '\\8'"):
             check(br"\8", "\\8")
         with self.assertWarns(DeprecationWarning):
             check(br"\9", "\\9")
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r"invalid escape sequence '\\\xfa'") as cm:
             check(b"\\\xfa", "\\\xfa")
+        with self.assertWarnsRegex(DeprecationWarning,
+                r"invalid escape sequence '\\z'"):
+            self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
 
     def test_decode_errors(self):
         decode = codecs.unicode_escape_decode
@@ -2515,6 +2542,44 @@ def test_decode_errors(self):
         self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
         self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
 
+    def test_partial(self):
+        self.check_partial(
+            "\x00\t\n\r\\\xff\uffff\U00010000",
+            [
+                '',
+                '',
+                '',
+                '\x00',
+                '\x00',
+                '\x00\t',
+                '\x00\t',
+                '\x00\t\n',
+                '\x00\t\n',
+                '\x00\t\n\r',
+                '\x00\t\n\r',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff\U00010000',
+            ]
+        )
 
 class RawUnicodeEscapeTest(unittest.TestCase):
     def test_empty(self):
diff --git a/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst b/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst
@@ -0,0 +1,2 @@
+Fix incremental decoder and stream reader in the "unicode-escape" codec.
+Previously they failed if the escape sequence was split.
diff --git a/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst b/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
@@ -0,0 +1,2 @@
+Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
+handler.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
@@ -519,35 +519,41 @@ _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
 /*[clinic input]
 _codecs.unicode_escape_decode
     data: Py_buffer(accept={str, buffer})
-    errors: str(accept={str, NoneType}) = NULL
+    errors: str(accept={str, NoneType}) = None
+    final: bool = True
     /
 [clinic start generated code]*/
 
 static PyObject *
 _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                   const char *errors)
-/*[clinic end generated code: output=3ca3c917176b82ab input=49fd27d06813a7f5]*/
+                                   const char *errors, int final)
+/*[clinic end generated code: output=b284f97b12c635ee input=15019f081ffe272b]*/
 {
-    PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
-                                                      errors);
-    return codec_tuple(decoded, data->len);
+    Py_ssize_t consumed = data->len;
+    PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
+                                                               errors,
+                                                               final ? NULL : &consumed);
+    return codec_tuple(decoded, consumed);
 }
 
 /*[clinic input]
 _codecs.raw_unicode_escape_decode
     data: Py_buffer(accept={str, buffer})
-    errors: str(accept={str, NoneType}) = NULL
+    errors: str(accept={str, NoneType}) = None
+    final: bool = True
     /
 [clinic start generated code]*/
 
 static PyObject *
 _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                       const char *errors)
-/*[clinic end generated code: output=c98eeb56028070a6 input=770903a211434ebc]*/
+                                       const char *errors, int final)
+/*[clinic end generated code: output=11dbd96301e2879e input=b93f823aa8c343ad]*/
 {
-    PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
-                                                         errors);
-    return codec_tuple(decoded, data->len);
+    Py_ssize_t consumed = data->len;
+    PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
+                                                                  errors,
+                                                                  final ? NULL : &consumed);
+    return codec_tuple(decoded, consumed);
 }
 
 /*[clinic input]
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
diff --git a/Python/ast.c b/Python/ast.c

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Fix incremental decoder and stream reader in the "unicode-escape" codec.`
	`2`	`+Previously they failed if the escape sequence was split.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error`
	`2`	`+handler.`