From 8e3fdfe576624cf071ba3a78838ac6b30a69394b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 27 Oct 2024 11:02:34 +0100 Subject: [PATCH 1/6] improve test coverage of codecs C API --- Lib/test/test_capi/test_codecs.py | 145 +++++++++++++++++++++++++----- 1 file changed, 122 insertions(+), 23 deletions(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index 85491a89947318..b764981cca6da2 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -745,7 +745,69 @@ def test_codec_stream_writer(self): codec_stream_writer(NULL, stream, 'strict') +class UnsafeUnicodeEncodeError(UnicodeEncodeError): + def __init__(self, encoding, message, start, end, reason): + self.may_crash = (end - start) < 0 or (end - start) >= len(message) + super().__init__(encoding, message, start, end, reason) + + +class UnsafeUnicodeDecodeError(UnicodeDecodeError): + def __init__(self, encoding, message, start, end, reason): + # the case end - start >= len(message) does not crash + self.may_crash = (end - start) < 0 + super().__init__(encoding, message, start, end, reason) + + +class UnsafeUnicodeTranslateError(UnicodeTranslateError): + def __init__(self, message, start, end, reason): + # <= 0 because PyCodec_ReplaceErrors tries to check the Unicode kind + # of a 0-length result (which is by convention PyUnicode_1BYTE_KIND + # and not PyUnicode_2BYTE_KIND as it currently expects) + self.may_crash = (end - start) <= 0 or (end - start) >= len(message) + super().__init__(message, start, end, reason) + + class CAPICodecErrors(unittest.TestCase): + @classmethod + def _generate_exceptions(cls, atomic_literal, factory, objlens): + for objlen in objlens: + obj = atomic_literal * objlen + m = 2 * max(2, objlen) + for start in range(-m, m): + for end in range(-m, m): + yield factory(obj, start, end) + + @classmethod + def generate_encode_errors(cls, objlen, *objlens): + def factory(obj, start, end): + return UnsafeUnicodeEncodeError('utf-8', obj, start, end, 'reason') + return tuple(cls._generate_exceptions('0', factory, [objlen, *objlens])) + + @classmethod + def generate_decode_errors(cls, objlen, *objlens): + def factory(obj, start, end): + return UnsafeUnicodeDecodeError('utf-8', obj, start, end, 'reason') + return tuple(cls._generate_exceptions(b'0', factory, [objlen, *objlens])) + + @classmethod + def generate_translate_errors(cls, objlen, *objlens): + def factory(obj, start, end): + return UnsafeUnicodeTranslateError(obj, start, end, 'reason') + return tuple(cls._generate_exceptions('0', factory, [objlen, *objlens])) + + @classmethod + def setUpClass(cls): + cls.unicode_encode_errors = cls.generate_encode_errors(0, 1, 5) + cls.unicode_decode_errors = cls.generate_decode_errors(0, 1, 5) + cls.unicode_translate_errors = cls.generate_translate_errors(0, 1, 5) + cls.all_unicode_errors = ( + cls.unicode_encode_errors + + cls.unicode_decode_errors + + cls.unicode_translate_errors + ) + cls.bad_unicode_errors = ( + ValueError(), + ) def test_codec_register_error(self): # for cleaning up between tests @@ -780,33 +842,70 @@ def test_codec_lookup_error(self): self.assertIs(codec_lookup_error('ignore'), codecs.ignore_errors) self.assertIs(codec_lookup_error('replace'), codecs.replace_errors) self.assertIs(codec_lookup_error('xmlcharrefreplace'), codecs.xmlcharrefreplace_errors) + self.assertIs(codec_lookup_error('backslashreplace'), codecs.backslashreplace_errors) self.assertIs(codec_lookup_error('namereplace'), codecs.namereplace_errors) self.assertRaises(LookupError, codec_lookup_error, 'unknown') - def test_codec_error_handlers(self): - exceptions = [ - # A UnicodeError with an empty message currently crashes: - # See: https://github.com/python/cpython/issues/123378 - # UnicodeEncodeError('bad', '', 0, 1, 'reason'), - UnicodeEncodeError('bad', 'x', 0, 1, 'reason'), - UnicodeEncodeError('bad', 'xyz123', 0, 1, 'reason'), - UnicodeEncodeError('bad', 'xyz123', 1, 4, 'reason'), - ] - - strict_handler = _testcapi.codec_strict_errors + def test_codec_strict_errors_handler(self): + handler = _testcapi.codec_strict_errors + for exc in self.all_unicode_errors + self.bad_unicode_errors: + with self.subTest(handler=handler, exc=exc): + self.assertRaises(type(exc), handler, exc) + + def test_codec_ignore_errors_handler(self): + handler = _testcapi.codec_ignore_errors + all_exceptions = self.all_unicode_errors + bad_exceptions = self.bad_unicode_errors + self.do_test_codec_errors_handler(handler, all_exceptions, bad_exceptions) + + def test_codec_replace_errors_handler(self): + handler = _testcapi.codec_replace_errors + all_exceptions = self.all_unicode_errors + bad_exceptions = self.bad_unicode_errors + self.do_test_codec_errors_handler(handler, all_exceptions, bad_exceptions) + + def test_codec_xmlcharrefreplace_errors_handler(self): + handler = _testcapi.codec_xmlcharrefreplace_errors + exceptions = self.unicode_encode_errors + bad_exceptions = ( + self.bad_unicode_errors + + tuple(e for e in self.all_unicode_errors if e not in exceptions) + ) + self.do_test_codec_errors_handler(handler, exceptions, bad_exceptions) + + def test_codec_backslashreplace_errors_handler(self): + handler = _testcapi.codec_backslashreplace_errors + exceptions = self.all_unicode_errors + bad_exceptions = self.bad_unicode_errors + self.do_test_codec_errors_handler(handler, exceptions, bad_exceptions) + + def test_codec_namereplace_errors_handler(self): + handler = _testlimitedcapi.codec_namereplace_errors + exceptions = self.unicode_encode_errors + bad_exceptions = ( + self.bad_unicode_errors + + tuple(e for e in self.all_unicode_errors if e not in exceptions) + ) + self.do_test_codec_errors_handler(handler, exceptions, bad_exceptions) + + def do_test_codec_errors_handler(self, handler, exceptions, bad_exceptions): + at_least_one = False for exc in exceptions: - with self.subTest(handler=strict_handler, exc=exc): - self.assertRaises(UnicodeEncodeError, strict_handler, exc) - - for handler in [ - _testcapi.codec_ignore_errors, - _testcapi.codec_replace_errors, - _testcapi.codec_xmlcharrefreplace_errors, - _testlimitedcapi.codec_namereplace_errors, - ]: - for exc in exceptions: - with self.subTest(handler=handler, exc=exc): - self.assertIsInstance(handler(exc), tuple) + # See https://github.com/python/cpython/issues/123378 and related + # discussion and issues for details. + if exc.may_crash: + continue + + at_least_one = True + with self.subTest(handler=handler, exc=exc): + # test that the handler does not crash + self.assertIsInstance(handler(exc), tuple) + + self.assertTrue(at_least_one, "all exceptions are crashing") + + for bad_exc in bad_exceptions: + with self.subTest('bad type', handler=handler, exc=bad_exc): + self.assertRaises(TypeError, handler, bad_exc) if __name__ == "__main__": From 03959c4195f48d532b4221f61839932eb537ea49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:46:29 +0100 Subject: [PATCH 2/6] address Petr's review --- Lib/test/test_capi/test_codecs.py | 73 +++++++++++++------------------ 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index b764981cca6da2..fef7a67ee4a9ce 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -769,37 +769,39 @@ def __init__(self, message, start, end, reason): class CAPICodecErrors(unittest.TestCase): @classmethod - def _generate_exceptions(cls, atomic_literal, factory, objlens): - for objlen in objlens: - obj = atomic_literal * objlen + def _generate_exception_args(cls): + for objlen in range(10): m = 2 * max(2, objlen) for start in range(-m, m): for end in range(-m, m): - yield factory(obj, start, end) + yield objlen, start, end @classmethod - def generate_encode_errors(cls, objlen, *objlens): - def factory(obj, start, end): - return UnsafeUnicodeEncodeError('utf-8', obj, start, end, 'reason') - return tuple(cls._generate_exceptions('0', factory, [objlen, *objlens])) + def generate_encode_errors(cls): + return tuple( + UnsafeUnicodeEncodeError('utf-8', '0' * objlen, start, end, 'why') + for objlen, start, end in cls._generate_exception_args() + ) @classmethod - def generate_decode_errors(cls, objlen, *objlens): - def factory(obj, start, end): - return UnsafeUnicodeDecodeError('utf-8', obj, start, end, 'reason') - return tuple(cls._generate_exceptions(b'0', factory, [objlen, *objlens])) + def generate_decode_errors(cls): + return tuple( + UnsafeUnicodeDecodeError('utf-8', b'0' * objlen, start, end, 'why') + for objlen, start, end in cls._generate_exception_args() + ) @classmethod - def generate_translate_errors(cls, objlen, *objlens): - def factory(obj, start, end): - return UnsafeUnicodeTranslateError(obj, start, end, 'reason') - return tuple(cls._generate_exceptions('0', factory, [objlen, *objlens])) + def generate_translate_errors(cls): + return tuple( + UnsafeUnicodeTranslateError('0' * objlen, start, end, 'why') + for objlen, start, end in cls._generate_exception_args() + ) @classmethod def setUpClass(cls): - cls.unicode_encode_errors = cls.generate_encode_errors(0, 1, 5) - cls.unicode_decode_errors = cls.generate_decode_errors(0, 1, 5) - cls.unicode_translate_errors = cls.generate_translate_errors(0, 1, 5) + cls.unicode_encode_errors = cls.generate_encode_errors() + cls.unicode_decode_errors = cls.generate_decode_errors() + cls.unicode_translate_errors = cls.generate_translate_errors() cls.all_unicode_errors = ( cls.unicode_encode_errors + cls.unicode_decode_errors @@ -854,41 +856,25 @@ def test_codec_strict_errors_handler(self): def test_codec_ignore_errors_handler(self): handler = _testcapi.codec_ignore_errors - all_exceptions = self.all_unicode_errors - bad_exceptions = self.bad_unicode_errors - self.do_test_codec_errors_handler(handler, all_exceptions, bad_exceptions) + self.do_test_codec_errors_handler(handler, self.all_unicode_errors) def test_codec_replace_errors_handler(self): handler = _testcapi.codec_replace_errors - all_exceptions = self.all_unicode_errors - bad_exceptions = self.bad_unicode_errors - self.do_test_codec_errors_handler(handler, all_exceptions, bad_exceptions) + self.do_test_codec_errors_handler(handler, self.all_unicode_errors) def test_codec_xmlcharrefreplace_errors_handler(self): handler = _testcapi.codec_xmlcharrefreplace_errors - exceptions = self.unicode_encode_errors - bad_exceptions = ( - self.bad_unicode_errors - + tuple(e for e in self.all_unicode_errors if e not in exceptions) - ) - self.do_test_codec_errors_handler(handler, exceptions, bad_exceptions) + self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) def test_codec_backslashreplace_errors_handler(self): handler = _testcapi.codec_backslashreplace_errors - exceptions = self.all_unicode_errors - bad_exceptions = self.bad_unicode_errors - self.do_test_codec_errors_handler(handler, exceptions, bad_exceptions) + self.do_test_codec_errors_handler(handler, self.all_unicode_errors) def test_codec_namereplace_errors_handler(self): handler = _testlimitedcapi.codec_namereplace_errors - exceptions = self.unicode_encode_errors - bad_exceptions = ( - self.bad_unicode_errors - + tuple(e for e in self.all_unicode_errors if e not in exceptions) - ) - self.do_test_codec_errors_handler(handler, exceptions, bad_exceptions) + self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) - def do_test_codec_errors_handler(self, handler, exceptions, bad_exceptions): + def do_test_codec_errors_handler(self, handler, exceptions): at_least_one = False for exc in exceptions: # See https://github.com/python/cpython/issues/123378 and related @@ -903,7 +889,10 @@ def do_test_codec_errors_handler(self, handler, exceptions, bad_exceptions): self.assertTrue(at_least_one, "all exceptions are crashing") - for bad_exc in bad_exceptions: + for bad_exc in ( + self.bad_unicode_errors + + tuple(e for e in self.all_unicode_errors if e not in exceptions) + ): with self.subTest('bad type', handler=handler, exc=bad_exc): self.assertRaises(TypeError, handler, bad_exc) From b993a2e23a83925e90ba8bf0c1205c609ec7d1e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 31 Oct 2024 15:34:33 +0100 Subject: [PATCH 3/6] Simplify tests --- Lib/test/test_capi/test_codecs.py | 65 ++++++++++++++++--------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index fef7a67ee4a9ce..84cffdec6df6b0 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -7,6 +7,7 @@ import unittest.mock as mock import _testcapi from test.support import import_helper +from test.support.script_helper import assert_python_failure _testlimitedcapi = import_helper.import_module('_testlimitedcapi') @@ -745,55 +746,34 @@ def test_codec_stream_writer(self): codec_stream_writer(NULL, stream, 'strict') -class UnsafeUnicodeEncodeError(UnicodeEncodeError): - def __init__(self, encoding, message, start, end, reason): - self.may_crash = (end - start) < 0 or (end - start) >= len(message) - super().__init__(encoding, message, start, end, reason) - - -class UnsafeUnicodeDecodeError(UnicodeDecodeError): - def __init__(self, encoding, message, start, end, reason): - # the case end - start >= len(message) does not crash - self.may_crash = (end - start) < 0 - super().__init__(encoding, message, start, end, reason) - - -class UnsafeUnicodeTranslateError(UnicodeTranslateError): - def __init__(self, message, start, end, reason): - # <= 0 because PyCodec_ReplaceErrors tries to check the Unicode kind - # of a 0-length result (which is by convention PyUnicode_1BYTE_KIND - # and not PyUnicode_2BYTE_KIND as it currently expects) - self.may_crash = (end - start) <= 0 or (end - start) >= len(message) - super().__init__(message, start, end, reason) - - class CAPICodecErrors(unittest.TestCase): + @classmethod def _generate_exception_args(cls): - for objlen in range(10): - m = 2 * max(2, objlen) - for start in range(-m, m): - for end in range(-m, m): + for objlen in range(5): + maxind = 2 * max(2, objlen) + for start in range(-maxind, maxind + 1): + for end in range(-maxind, maxind + 1): yield objlen, start, end @classmethod def generate_encode_errors(cls): return tuple( - UnsafeUnicodeEncodeError('utf-8', '0' * objlen, start, end, 'why') + UnicodeEncodeError('utf-8', '0' * objlen, start, end, 'why') for objlen, start, end in cls._generate_exception_args() ) @classmethod def generate_decode_errors(cls): return tuple( - UnsafeUnicodeDecodeError('utf-8', b'0' * objlen, start, end, 'why') + UnicodeDecodeError('utf-8', b'0' * objlen, start, end, 'why') for objlen, start, end in cls._generate_exception_args() ) @classmethod def generate_translate_errors(cls): return tuple( - UnsafeUnicodeTranslateError('0' * objlen, start, end, 'why') + UnicodeTranslateError('0' * objlen, start, end, 'why') for objlen, start, end in cls._generate_exception_args() ) @@ -879,7 +859,7 @@ def do_test_codec_errors_handler(self, handler, exceptions): for exc in exceptions: # See https://github.com/python/cpython/issues/123378 and related # discussion and issues for details. - if exc.may_crash: + if self._exception_may_crash(exc): continue at_least_one = True @@ -887,7 +867,8 @@ def do_test_codec_errors_handler(self, handler, exceptions): # test that the handler does not crash self.assertIsInstance(handler(exc), tuple) - self.assertTrue(at_least_one, "all exceptions are crashing") + if exceptions: + self.assertTrue(at_least_one, "all exceptions are crashing") for bad_exc in ( self.bad_unicode_errors @@ -896,6 +877,28 @@ def do_test_codec_errors_handler(self, handler, exceptions): with self.subTest('bad type', handler=handler, exc=bad_exc): self.assertRaises(TypeError, handler, bad_exc) + @classmethod + def _exception_may_crash(cls, exc): + """Indicate whether a Unicode exception may crash the interpreter + when used by a built-in codecs error handler. + + This should only be used by "do_test_codec_errors_handler". + """ + message, start, end = exc.object, exc.start, exc.end + match exc: + case UnicodeEncodeError(): + return end < start or (end - start) >= len(message) + case UnicodeDecodeError(): + # The case "end - start >= len(message)" does not crash. + return end < start + case UnicodeTranslateError(): + # Test "end <= start" because PyCodec_ReplaceErrors checks + # the Unicode kind of a 0-length string which by convention + # is PyUnicode_1BYTE_KIND and not PyUnicode_2BYTE_KIND as + # the handler currently expects. + return end <= start or (end - start) >= len(message) + return False + if __name__ == "__main__": unittest.main() From 69d0643f1c3fdb53db341a13bc13ab628a71e916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 31 Oct 2024 15:34:45 +0100 Subject: [PATCH 4/6] Simplify tests --- Lib/test/test_capi/test_codecs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index 84cffdec6df6b0..52ef362f4fc60d 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -7,7 +7,6 @@ import unittest.mock as mock import _testcapi from test.support import import_helper -from test.support.script_helper import assert_python_failure _testlimitedcapi = import_helper.import_module('_testlimitedcapi') From 05f75b6bb7d1bbb9f191ba51cdcfda5a26fd6ffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 31 Oct 2024 16:33:31 +0100 Subject: [PATCH 5/6] Update Lib/test/test_capi/test_codecs.py --- Lib/test/test_capi/test_codecs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index 52ef362f4fc60d..a43117da65d3ac 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -881,6 +881,8 @@ def _exception_may_crash(cls, exc): """Indicate whether a Unicode exception may crash the interpreter when used by a built-in codecs error handler. + Until gh-123378 is fixed, we skip the tests for these exceptions. + This should only be used by "do_test_codec_errors_handler". """ message, start, end = exc.object, exc.start, exc.end From c36232a650a9d3f579e2b383f153a0cad9e4b517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:03:09 +0100 Subject: [PATCH 6/6] Update Lib/test/test_capi/test_codecs.py Co-authored-by: Petr Viktorin --- Lib/test/test_capi/test_codecs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index a43117da65d3ac..a557e35e68915d 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -878,8 +878,8 @@ def do_test_codec_errors_handler(self, handler, exceptions): @classmethod def _exception_may_crash(cls, exc): - """Indicate whether a Unicode exception may crash the interpreter - when used by a built-in codecs error handler. + """Indicate whether a Unicode exception might currently crash + the interpreter when used by a built-in codecs error handler. Until gh-123378 is fixed, we skip the tests for these exceptions.