Skip to content

Commit 9d1c4d6

Browse files
authored
bpo-38256: Fix binascii.crc32() when inputs are 4+GiB (GH-32000)
When compiled with `USE_ZLIB_CRC32` defined (`configure` sets this on POSIX systems), `binascii.crc32(...)` failed to compute the correct value when the input data was >= 4GiB. Because the zlib crc32 API is limited to a 32-bit length. This lines it up with the `zlib.crc32(...)` implementation that doesn't have that flaw. **Performance:** This also adopts the same GIL releasing for larger inputs logic that `zlib.crc32` has, and causes the Windows build to always use zlib's crc32 instead of our slow C code as zlib is a required build dependency on Windows.
1 parent 3ae975f commit 9d1c4d6

File tree

6 files changed

+89
-33
lines changed

6 files changed

+89
-33
lines changed

Lib/test/test_binascii.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import binascii
55
import array
66
import re
7-
from test.support import warnings_helper
7+
from test.support import bigmemtest, _1G, _4G, warnings_helper
88

99

1010
# Note: "*_hex" functions are aliases for "(un)hexlify"
@@ -441,6 +441,14 @@ class BytearrayBinASCIITest(BinASCIITest):
441441
class MemoryviewBinASCIITest(BinASCIITest):
442442
type2test = memoryview
443443

444+
class ChecksumBigBufferTestCase(unittest.TestCase):
445+
"""bpo-38256 - check that inputs >=4 GiB are handled correctly."""
446+
447+
@bigmemtest(size=_4G + 4, memuse=1, dry_run=False)
448+
def test_big_buffer(self, size):
449+
data = b"nyan" * (_1G + 1)
450+
self.assertEqual(binascii.crc32(data), 1044521549)
451+
444452

445453
if __name__ == "__main__":
446454
unittest.main()
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
Fix :func:`binascii.crc32` when it is compiled to use zlib'c crc32 to
2+
work properly on inputs 4+GiB in length instead of returning the wrong
3+
result. The workaround prior to this was to always feed the function
4+
data in increments smaller than 4GiB or to just call the zlib module
5+
function.
6+
7+
We also have :func:`binascii.crc32` release the GIL when computing
8+
on larger inputs as :func:`zlib.crc32` and :mod:`hashlib` do.
9+
10+
This also boosts performance on Windows as it now uses the zlib crc32
11+
implementation for :func:`binascii.crc32` for a 2-3x speedup.
12+
13+
That the stdlib has a crc32 API in two modules is a known historical
14+
oddity. This moves us closer to a single implementation behind them.

Modules/binascii.c

Lines changed: 51 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,21 @@ static const unsigned int crc_32_tab[256] = {
737737
0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
738738
0x2d02ef8dU
739739
};
740+
741+
static unsigned int
742+
internal_crc32(const unsigned char *bin_data, Py_ssize_t len, unsigned int crc)
743+
{ /* By Jim Ahlstrom; All rights transferred to CNRI */
744+
unsigned int result;
745+
746+
crc = ~ crc;
747+
while (len-- > 0) {
748+
crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8);
749+
/* Note: (crc >> 8) MUST zero fill on left */
750+
}
751+
752+
result = (crc ^ 0xFFFFFFFF);
753+
return result & 0xffffffff;
754+
}
740755
#endif /* USE_ZLIB_CRC32 */
741756

742757
/*[clinic input]
@@ -754,34 +769,46 @@ binascii_crc32_impl(PyObject *module, Py_buffer *data, unsigned int crc)
754769
/*[clinic end generated code: output=52cf59056a78593b input=bbe340bc99d25aa8]*/
755770

756771
#ifdef USE_ZLIB_CRC32
757-
/* This was taken from zlibmodule.c PyZlib_crc32 (but is PY_SSIZE_T_CLEAN) */
772+
/* This is the same as zlibmodule.c zlib_crc32_impl. It exists in two
773+
* modules for historical reasons. */
758774
{
759-
const Byte *buf;
760-
Py_ssize_t len;
761-
int signed_val;
762-
763-
buf = (Byte*)data->buf;
764-
len = data->len;
765-
signed_val = crc32(crc, buf, len);
766-
return (unsigned int)signed_val & 0xffffffffU;
775+
/* Releasing the GIL for very small buffers is inefficient
776+
and may lower performance */
777+
if (data->len > 1024*5) {
778+
unsigned char *buf = data->buf;
779+
Py_ssize_t len = data->len;
780+
781+
Py_BEGIN_ALLOW_THREADS
782+
/* Avoid truncation of length for very large buffers. crc32() takes
783+
length as an unsigned int, which may be narrower than Py_ssize_t. */
784+
while ((size_t)len > UINT_MAX) {
785+
crc = crc32(crc, buf, UINT_MAX);
786+
buf += (size_t) UINT_MAX;
787+
len -= (size_t) UINT_MAX;
788+
}
789+
crc = crc32(crc, buf, (unsigned int)len);
790+
Py_END_ALLOW_THREADS
791+
} else {
792+
crc = crc32(crc, data->buf, (unsigned int)data->len);
793+
}
794+
return crc & 0xffffffff;
767795
}
768796
#else /* USE_ZLIB_CRC32 */
769-
{ /* By Jim Ahlstrom; All rights transferred to CNRI */
770-
const unsigned char *bin_data;
771-
Py_ssize_t len;
772-
unsigned int result;
773-
774-
bin_data = data->buf;
775-
len = data->len;
776-
777-
crc = ~ crc;
778-
while (len-- > 0) {
779-
crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8);
780-
/* Note: (crc >> 8) MUST zero fill on left */
797+
{
798+
const unsigned char *bin_data = data->buf;
799+
Py_ssize_t len = data->len;
800+
801+
/* Releasing the GIL for very small buffers is inefficient
802+
and may lower performance */
803+
if (len > 1024*5) {
804+
unsigned int result;
805+
Py_BEGIN_ALLOW_THREADS
806+
result = internal_crc32(bin_data, len, crc);
807+
Py_END_ALLOW_THREADS
808+
return result;
809+
} else {
810+
return internal_crc32(bin_data, len, crc);
781811
}
782-
783-
result = (crc ^ 0xFFFFFFFF);
784-
return result & 0xffffffff;
785812
}
786813
#endif /* USE_ZLIB_CRC32 */
787814

Modules/clinic/zlibmodule.c.h

Lines changed: 8 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/zlibmodule.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,7 +1420,7 @@ zlib_adler32_impl(PyObject *module, Py_buffer *data, unsigned int value)
14201420
}
14211421

14221422
/*[clinic input]
1423-
zlib.crc32
1423+
zlib.crc32 -> unsigned_int
14241424
14251425
data: Py_buffer
14261426
value: unsigned_int(bitwise=True) = 0
@@ -1432,9 +1432,9 @@ Compute a CRC-32 checksum of data.
14321432
The returned checksum is an integer.
14331433
[clinic start generated code]*/
14341434

1435-
static PyObject *
1435+
static unsigned int
14361436
zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
1437-
/*[clinic end generated code: output=63499fa20af7ea25 input=26c3ed430fa00b4c]*/
1437+
/*[clinic end generated code: output=b217562e4fe6d6a6 input=1229cb2fb5ea948a]*/
14381438
{
14391439
/* Releasing the GIL for very small buffers is inefficient
14401440
and may lower performance */
@@ -1455,7 +1455,7 @@ zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
14551455
} else {
14561456
value = crc32(value, data->buf, (unsigned int)data->len);
14571457
}
1458-
return PyLong_FromUnsignedLong(value & 0xffffffffU);
1458+
return value;
14591459
}
14601460

14611461

PCbuild/pythoncore.vcxproj

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,9 @@
366366
<ClCompile Include="..\Modules\arraymodule.c" />
367367
<ClCompile Include="..\Modules\atexitmodule.c" />
368368
<ClCompile Include="..\Modules\audioop.c" />
369-
<ClCompile Include="..\Modules\binascii.c" />
369+
<ClCompile Include="..\Modules\binascii.c">
370+
<PreprocessorDefinitions>USE_ZLIB_CRC32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
371+
</ClCompile>
370372
<ClCompile Include="..\Modules\cmathmodule.c" />
371373
<ClCompile Include="..\Modules\_datetimemodule.c" />
372374
<ClCompile Include="..\Modules\errnomodule.c" />

0 commit comments

Comments
 (0)