
Commit 82ed1c9

[SPARK-45179][PYTHON] Increase Numpy minimum version to 1.21
### What changes were proposed in this pull request?
Increase the NumPy minimum version to 1.21.

### Why are the changes needed?
- According to the [release history](https://pypi.org/project/numpy/#history), NumPy 1.15 was released about 5 years ago, while the last maintenance release of the 1.21 line came out about a year ago;
- with 1.21 as the minimum version, we can discard all NumPy version checking in PySpark;
- the minimum supported pandas, `pandas==1.4.4`, already depends on `numpy>=1.21.0`.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Searched for remaining version checks with `ag`:

```
(spark_dev_310) ➜ spark git:(master) ag --py 'numpy\.__version' python
(spark_dev_310) ➜ spark git:(master)
(spark_dev_310) ➜ spark git:(master) ag --py 'np\.__version' python
python/pyspark/ml/image.py
231:        if LooseVersion(np.__version__) >= LooseVersion("1.9"):

python/pyspark/pandas/typedef/typehints.py
152:    if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):

python/pyspark/pandas/tests/test_typedef.py
365:        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):

python/pyspark/pandas/tests/computation/test_apply_func.py
257:        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
```

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #42944 from zhengruifeng/bump_min_np_ver.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
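The guards removed in the diffs below all compare `np.__version__` against floors at or below the new minimum, so they can never be false on a supported install. A minimal sketch of that argument, assuming `numpy>=1.21` is installed and a Python version where `distutils` still ships in the stdlib (it was removed in 3.12):

```python
from distutils.version import LooseVersion  # the deprecated import this PR deletes

import numpy as np

# With setup.py enforcing numpy>=1.21, both deleted guards are tautologies:
assert LooseVersion(np.__version__) >= LooseVersion("1.9")   # ml/image.py
assert LooseVersion(np.__version__) >= LooseVersion("1.21")  # typehints.py and tests
```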
1 parent: 0d1f43c

File tree

6 files changed: +11 −21 lines changed


python/docs/source/getting_started/install.rst

Lines changed: 1 addition & 1 deletion

```diff
@@ -158,7 +158,7 @@ Package Supported version Note
 `py4j` >=0.10.9.7 Required
 `pandas` >=1.4.4 Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
 `pyarrow` >=4.0.0 Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
-`numpy` >=1.15 Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
+`numpy` >=1.21 Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
 `grpcio` >=1.48,<1.57 Required for Spark Connect
 `grpcio-status` >=1.48,<1.57 Required for Spark Connect
 `googleapis-common-protos` ==1.56.4 Required for Spark Connect
```

python/pyspark/ml/image.py

Lines changed: 1 addition & 9 deletions

```diff
@@ -28,7 +28,6 @@
 from typing import Any, Dict, List, NoReturn, Optional, cast
 
 import numpy as np
-from distutils.version import LooseVersion
 
 from pyspark import SparkContext
 from pyspark.sql.types import Row, StructType, _create_row, _parse_datatype_json_string
@@ -225,14 +224,7 @@ def toImage(self, array: np.ndarray, origin: str = "") -> Row:
         else:
             raise ValueError("Invalid number of channels")
 
-        # Running `bytearray(numpy.array([1]))` fails in specific Python versions
-        # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3.
-        # Here, it avoids it by converting it to bytes.
-        if LooseVersion(np.__version__) >= LooseVersion("1.9"):
-            data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
-        else:
-            # Numpy prior to 1.9 don't have `tobytes` method.
-            data = bytearray(array.astype(dtype=np.uint8).ravel())
+        data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
 
         # Creating new Row with _create_row(), because Row(name = value, ... )
         # orders fields by name, which conflicts with expected schema order
```
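For reference, a minimal standalone sketch of the surviving conversion path; the array values here are arbitrary illustration data, not from any Spark test:

```python
import numpy as np

# A tiny 2x2 RGB "image".
array = np.array(
    [[[255, 0, 0], [0, 255, 0]],
     [[0, 0, 255], [255, 255, 255]]]
)

# The retained branch: cast to uint8, flatten, take the raw bytes.
# ndarray.tobytes() has existed since NumPy 1.9, so with a 1.21 floor
# the deleted LooseVersion fallback is unreachable.
data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())

assert len(data) == array.size  # one byte per uint8 element
```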

python/pyspark/pandas/tests/computation/test_apply_func.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 from datetime import datetime
-from distutils.version import LooseVersion
 import sys
 import unittest
 from typing import List
@@ -254,7 +253,7 @@ def identify3(x) -> ps.DataFrame[float, [int, List[int]]]:
         self.assert_eq(actual, pdf)
 
         # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+        if sys.version_info >= (3, 8):
             import numpy.typing as ntp
 
             psdf = ps.from_pandas(pdf)
```

python/pyspark/pandas/tests/test_typedef.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -19,7 +19,6 @@
 import unittest
 import datetime
 import decimal
-from distutils.version import LooseVersion
 from typing import List
 
 import pandas
@@ -362,7 +361,7 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
         )
 
         # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+        if sys.version_info >= (3, 8):
             import numpy.typing as ntp
 
             self.assertEqual(
```

python/pyspark/pandas/typedef/typehints.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -23,7 +23,6 @@
 import sys
 import typing
 from collections.abc import Iterable
-from distutils.version import LooseVersion
 from inspect import isclass
 from typing import Any, Callable, Generic, List, Tuple, Union, Type, get_type_hints
 
@@ -149,7 +148,7 @@ def as_spark_type(
     - Python3's typing system
     """
     # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-    if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+    if sys.version_info >= (3, 8):
         if (
             hasattr(tpe, "__origin__")
             and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
```
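Why the NumPy half of the guard can go: `numpy.typing.NDArray` was added in NumPy 1.21, so once setup.py enforces that floor, only the Python version still varies. A minimal sketch of the alias shape that `as_spark_type` inspects (the variable name `tpe` mirrors the diff; the example hint is illustrative):

```python
import sys

import numpy as np
import numpy.typing as ntp  # NDArray below was added in NumPy 1.21

if sys.version_info >= (3, 8):
    tpe = ntp.NDArray[np.int64]  # e.g. a user-supplied return-type hint

    # Parameterizing NDArray yields an alias over np.ndarray whose
    # arguments carry the dtype, which is the shape the hasattr/__origin__
    # check above keys on.
    assert hasattr(tpe, "__origin__") and tpe.__origin__ is np.ndarray
    print(tpe.__args__)  # (typing.Any, numpy.dtype[numpy.int64])
```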

python/setup.py

Lines changed: 6 additions & 5 deletions

```diff
@@ -131,6 +131,7 @@ def _supports_symlinks():
 # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
 # Also don't forget to update python/docs/source/getting_started/install.rst.
 _minimum_pandas_version = "1.4.4"
+_minimum_numpy_version = "1.21"
 _minimum_pyarrow_version = "4.0.0"
 _minimum_grpc_version = "1.56.0"
 _minimum_googleapis_common_protos_version = "1.56.4"
@@ -307,25 +308,25 @@ def run(self):
         # if you're updating the versions or dependencies.
         install_requires=["py4j==0.10.9.7"],
         extras_require={
-            "ml": ["numpy>=1.15"],
-            "mllib": ["numpy>=1.15"],
+            "ml": ["numpy>=%s" % _minimum_numpy_version],
+            "mllib": ["numpy>=%s" % _minimum_numpy_version],
             "sql": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
             "pandas_on_spark": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
             "connect": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
                 "grpcio>=%s" % _minimum_grpc_version,
                 "grpcio-status>=%s" % _minimum_grpc_version,
                 "googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
         },
         python_requires=">=3.8",
```
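Centralizing the floor means a future bump touches one assignment instead of five string literals, matching how `_minimum_pandas_version` is already handled. A quick sketch of what the `%`-formatting expands to (values copied from the diff above):

```python
_minimum_numpy_version = "1.21"

extras = {
    "ml": ["numpy>=%s" % _minimum_numpy_version],
    "mllib": ["numpy>=%s" % _minimum_numpy_version],
}
print(extras["ml"])  # ['numpy>=1.21']
```

End users see the new floor through the extras: for example, `pip install "pyspark[sql]"` now resolves `numpy>=1.21`.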
