Skip to content

Commit 9b6c60c

Browse files
bpo-31979: Simplify transforming decimals to ASCII (#4336)
in int(), float() and complex() parsers. This also speeds up parsing non-ASCII numbers by around 20%.
1 parent ce12629 commit 9b6c60c

File tree

7 files changed

+63
-139
lines changed

7 files changed

+63
-139
lines changed

Include/unicodeobject.h

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
17231723

17241724
#endif /* MS_WINDOWS */
17251725

1726+
#ifndef Py_LIMITED_API
17261727
/* --- Decimal Encoder ---------------------------------------------------- */
17271728

17281729
/* Takes a Unicode string holding a decimal value and writes it into
@@ -1747,34 +1748,31 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
17471748
17481749
*/
17491750

1750-
#ifndef Py_LIMITED_API
17511751
PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
17521752
Py_UNICODE *s, /* Unicode buffer */
17531753
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
17541754
char *output, /* Output buffer; must have size >= length */
17551755
const char *errors /* error handling */
17561756
) /* Py_DEPRECATED(3.3) */;
1757-
#endif
17581757

17591758
/* Transforms code points that have decimal digit property to the
17601759
corresponding ASCII digit code points.
17611760
17621761
Returns a new Unicode string on success, NULL on failure.
17631762
*/
17641763

1765-
#ifndef Py_LIMITED_API
17661764
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
17671765
Py_UNICODE *s, /* Unicode buffer */
17681766
Py_ssize_t length /* Number of Py_UNICODE chars to transform */
17691767
) /* Py_DEPRECATED(3.3) */;
1770-
#endif
17711768

1772-
/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
1773-
as argument instead of a raw buffer and length. This function additionally
1774-
transforms spaces to ASCII because this is what the callers in longobject,
1775-
floatobject, and complexobject did anyways. */
1769+
/* Converts a Unicode object holding a decimal value to an ASCII string
1770+
for use in int, float and complex parsers.
1771+
Transforms code points that have decimal digit property to the
1772+
corresponding ASCII digit code points. Transforms spaces to ASCII.
1773+
Replaces the first non-ASCII code point that is neither a decimal digit
1774+
nor a space with '?' and truncates the result right after it. */
17761775

1777-
#ifndef Py_LIMITED_API
17781776
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
17791777
PyObject *unicode /* Unicode object */
17801778
);

Lib/test/test_float.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def test_float(self):
5151
self.assertRaises(TypeError, float, {})
5252
self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
5353
# Lone surrogate
54-
self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
54+
self.assertRaises(ValueError, float, '\uD8F0')
5555
# check that we don't accept alternate exponent markers
5656
self.assertRaises(ValueError, float, "-1.7d29")
5757
self.assertRaises(ValueError, float, "3D-14")

Lib/test/test_unicode.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2068,11 +2068,14 @@ def test_codecs_errors(self):
20682068
# Error handling (wrong arguments)
20692069
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
20702070

2071-
# Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
2072-
self.assertRaises(UnicodeError, float, "\ud800")
2073-
self.assertRaises(UnicodeError, float, "\udf00")
2074-
self.assertRaises(UnicodeError, complex, "\ud800")
2075-
self.assertRaises(UnicodeError, complex, "\udf00")
2071+
# Error handling (lone surrogate in
2072+
# _PyUnicode_TransformDecimalAndSpaceToASCII())
2073+
self.assertRaises(ValueError, int, "\ud800")
2074+
self.assertRaises(ValueError, int, "\udf00")
2075+
self.assertRaises(ValueError, float, "\ud800")
2076+
self.assertRaises(ValueError, float, "\udf00")
2077+
self.assertRaises(ValueError, complex, "\ud800")
2078+
self.assertRaises(ValueError, complex, "\udf00")
20762079

20772080
def test_codecs(self):
20782081
# Encoding

Objects/complexobject.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
914914
if (s_buffer == NULL) {
915915
return NULL;
916916
}
917+
assert(PyUnicode_IS_ASCII(s_buffer));
918+
/* Simply get a pointer to existing ASCII characters. */
917919
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
918-
if (s == NULL) {
919-
goto exit;
920-
}
920+
assert(s != NULL);
921921
}
922922
else {
923923
PyErr_Format(PyExc_TypeError,
@@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
928928

929929
result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
930930
complex_from_string_inner);
931-
exit:
932931
Py_DECREF(s_buffer);
933932
return result;
934933
}

Objects/floatobject.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v)
176176
s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
177177
if (s_buffer == NULL)
178178
return NULL;
179+
assert(PyUnicode_IS_ASCII(s_buffer));
180+
/* Simply get a pointer to existing ASCII characters. */
179181
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
180-
if (s == NULL) {
181-
Py_DECREF(s_buffer);
182-
return NULL;
183-
}
182+
assert(s != NULL);
184183
}
185184
else if (PyBytes_Check(v)) {
186185
s = PyBytes_AS_STRING(v);

Objects/longobject.c

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2509,21 +2509,18 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
25092509
asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
25102510
if (asciidig == NULL)
25112511
return NULL;
2512+
assert(PyUnicode_IS_ASCII(asciidig));
2513+
/* Simply get a pointer to existing ASCII characters. */
25122514
buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
2513-
if (buffer == NULL) {
2514-
Py_DECREF(asciidig);
2515-
if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
2516-
return NULL;
2517-
}
2518-
else {
2519-
result = PyLong_FromString(buffer, &end, base);
2520-
if (end == NULL || (result != NULL && end == buffer + buflen)) {
2521-
Py_DECREF(asciidig);
2522-
return result;
2523-
}
2515+
assert(buffer != NULL);
2516+
2517+
result = PyLong_FromString(buffer, &end, base);
2518+
if (end == NULL || (result != NULL && end == buffer + buflen)) {
25242519
Py_DECREF(asciidig);
2525-
Py_XDECREF(result);
2520+
return result;
25262521
}
2522+
Py_DECREF(asciidig);
2523+
Py_XDECREF(result);
25272524
PyErr_Format(PyExc_ValueError,
25282525
"invalid literal for int() with base %d: %.200R",
25292526
base, u);

Objects/unicodeobject.c

Lines changed: 32 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
840840

841841
/* --- Unicode Object ----------------------------------------------------- */
842842

843-
static PyObject *
844-
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
845-
846843
static inline Py_ssize_t
847844
findchar(const void *s, int kind,
848845
Py_ssize_t size, Py_UCS4 ch,
@@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
90629059
return _PyUnicode_TranslateCharmap(str, mapping, errors);
90639060
}
90649061

9065-
static Py_UCS4
9066-
fix_decimal_and_space_to_ascii(PyObject *self)
9067-
{
9068-
/* No need to call PyUnicode_READY(self) because this function is only
9069-
called as a callback from fixup() which does it already. */
9070-
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9071-
const int kind = PyUnicode_KIND(self);
9072-
void *data = PyUnicode_DATA(self);
9073-
Py_UCS4 maxchar = 127, ch, fixed;
9074-
int modified = 0;
9075-
Py_ssize_t i;
9076-
9077-
for (i = 0; i < len; ++i) {
9078-
ch = PyUnicode_READ(kind, data, i);
9079-
fixed = 0;
9080-
if (ch > 127) {
9081-
if (Py_UNICODE_ISSPACE(ch))
9082-
fixed = ' ';
9083-
else {
9084-
const int decimal = Py_UNICODE_TODECIMAL(ch);
9085-
if (decimal >= 0)
9086-
fixed = '0' + decimal;
9087-
}
9088-
if (fixed != 0) {
9089-
modified = 1;
9090-
maxchar = Py_MAX(maxchar, fixed);
9091-
PyUnicode_WRITE(kind, data, i, fixed);
9092-
}
9093-
else
9094-
maxchar = Py_MAX(maxchar, ch);
9095-
}
9096-
}
9097-
9098-
return (modified) ? maxchar : 0;
9099-
}
9100-
91019062
PyObject *
91029063
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
91039064
{
@@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
91079068
}
91089069
if (PyUnicode_READY(unicode) == -1)
91099070
return NULL;
9110-
if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9071+
if (PyUnicode_IS_ASCII(unicode)) {
91119072
/* If the string is already ASCII, just return the same string */
91129073
Py_INCREF(unicode);
91139074
return unicode;
91149075
}
9115-
return fixup(unicode, fix_decimal_and_space_to_ascii);
9076+
9077+
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9078+
PyObject *result = PyUnicode_New(len, 127);
9079+
if (result == NULL) {
9080+
return NULL;
9081+
}
9082+
9083+
Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9084+
int kind = PyUnicode_KIND(unicode);
9085+
const void *data = PyUnicode_DATA(unicode);
9086+
Py_ssize_t i;
9087+
for (i = 0; i < len; ++i) {
9088+
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9089+
if (ch < 127) {
9090+
out[i] = ch;
9091+
}
9092+
else if (Py_UNICODE_ISSPACE(ch)) {
9093+
out[i] = ' ';
9094+
}
9095+
else {
9096+
int decimal = Py_UNICODE_TODECIMAL(ch);
9097+
if (decimal < 0) {
9098+
out[i] = '?';
9099+
_PyUnicode_LENGTH(result) = i + 1;
9100+
break;
9101+
}
9102+
out[i] = '0' + decimal;
9103+
}
9104+
}
9105+
9106+
return result;
91169107
}
91179108

91189109
PyObject *
@@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
95889579
return tailmatch(str, substr, start, end, direction);
95899580
}
95909581

9591-
/* Apply fixfct filter to the Unicode object self and return a
9592-
reference to the modified object */
9593-
9594-
static PyObject *
9595-
fixup(PyObject *self,
9596-
Py_UCS4 (*fixfct)(PyObject *s))
9597-
{
9598-
PyObject *u;
9599-
Py_UCS4 maxchar_old, maxchar_new = 0;
9600-
PyObject *v;
9601-
9602-
u = _PyUnicode_Copy(self);
9603-
if (u == NULL)
9604-
return NULL;
9605-
maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9606-
9607-
/* fix functions return the new maximum character in a string,
9608-
if the kind of the resulting unicode object does not change,
9609-
everything is fine. Otherwise we need to change the string kind
9610-
and re-run the fix function. */
9611-
maxchar_new = fixfct(u);
9612-
9613-
if (maxchar_new == 0) {
9614-
/* no changes */;
9615-
if (PyUnicode_CheckExact(self)) {
9616-
Py_DECREF(u);
9617-
Py_INCREF(self);
9618-
return self;
9619-
}
9620-
else
9621-
return u;
9622-
}
9623-
9624-
maxchar_new = align_maxchar(maxchar_new);
9625-
9626-
if (maxchar_new == maxchar_old)
9627-
return u;
9628-
9629-
/* In case the maximum character changed, we need to
9630-
convert the string to the new category. */
9631-
v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9632-
if (v == NULL) {
9633-
Py_DECREF(u);
9634-
return NULL;
9635-
}
9636-
if (maxchar_new > maxchar_old) {
9637-
/* If the maxchar increased so that the kind changed, not all
9638-
characters are representable anymore and we need to fix the
9639-
string again. This only happens in very few cases. */
9640-
_PyUnicode_FastCopyCharacters(v, 0,
9641-
self, 0, PyUnicode_GET_LENGTH(self));
9642-
maxchar_old = fixfct(v);
9643-
assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9644-
}
9645-
else {
9646-
_PyUnicode_FastCopyCharacters(v, 0,
9647-
u, 0, PyUnicode_GET_LENGTH(self));
9648-
}
9649-
Py_DECREF(u);
9650-
assert(_PyUnicode_CheckConsistency(v, 1));
9651-
return v;
9652-
}
9653-
96549582
static PyObject *
96559583
ascii_upper_or_lower(PyObject *self, int lower)
96569584
{

0 commit comments

Comments
 (0)