[SPARK-48710][PYTHON] Use NumPy 2.0 compatible types #47083

Closed
wants to merge 5 commits into from
12 changes: 12 additions & 0 deletions python/pyspark/core/rdd.py
@@ -5370,6 +5370,18 @@ def _test() -> None:
import tempfile
from pyspark.core.context import SparkContext

try:
Contributor

Shall we add a TODO that once we upgrade the minimum version to >= 2.0, we can remove this try-except and update the doctests?

Contributor Author

There exist also multiple tests already with:

try:
    # Numpy 1.14+ changed its string format.
    numpy.set_printoptions(legacy="1.13")
except TypeError:
    pass

I'd guess these should be considered for updating before that (since the minimum NumPy version is at 1.21 currently).
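For reference, a minimal sketch (assuming NumPy >= 2.0 is installed) of the scalar-repr change that breaks these doctests, and of the legacy print option this PR relies on:

import numpy as np

# Under NumPy 2.0 defaults, the repr of a numeric scalar carries its type.
print(repr(np.float64(3.5)))  # np.float64(3.5)

# legacy="1.25" (accepted only by NumPy >= 2.0) restores the pre-2.0 style,
# keeping doctest output identical across NumPy versions.
np.set_printoptions(legacy="1.25")
print(repr(np.float64(3.5)))  # 3.5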

# Numpy 2.0+ changed its string format,
# adding type information to numeric scalars.
import numpy as np
Member

There is actually a try-catch here, but I think there's some issue related to the import ..

Member

Let's move this to sql and ml only for now because both modules use numpy.

Member

Thank you.

Yes, the problem is that the try-catch didn't handle ModuleNotFoundError, which causes a failure like the following.

$ python/run-tests.py --python-executables python3 --modules pyspark-core
...
  File "/Users/dongjoon/APACHE/spark-merge/python/pyspark/core/rdd.py", line 5376, in _test
    import numpy as np
ModuleNotFoundError: No module named 'numpy'

Member

Let's move this to sql and ml only for now because both modules use numpy.

+1 for moving.
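A minimal sketch of the guarded setup being discussed, catching ImportError (which covers ModuleNotFoundError) alongside TypeError; as suggested above, the change ultimately moves this block out of pyspark-core and into the modules that actually depend on NumPy:

try:
    # NumPy 2.0+ adds type information to the repr of numeric scalars.
    import numpy as np
    from pandas.util.version import Version

    if Version(np.__version__) >= Version("2"):
        # legacy="1.25" is only accepted by numpy>=2.
        np.set_printoptions(legacy="1.25")
except (ImportError, TypeError):
    # NumPy/pandas may be absent when only the pyspark-core tests run.
    pass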

from pandas.util.version import Version

if Version(np.__version__) >= Version("2"):
# `legacy="1.25"` is only available in `numpy>=2`
np.set_printoptions(legacy="1.25") # type: ignore[arg-type]
except TypeError:
Member

Ah, yeah let's catch ImportError ....

pass

tmp_dir = tempfile.TemporaryDirectory()
globs = globals().copy()
# The small batch size here ensures that we see multiple batches,
4 changes: 2 additions & 2 deletions python/pyspark/ml/param/__init__.py
@@ -115,7 +115,7 @@ def _can_convert_to_list(value: Any) -> bool:
@staticmethod
def _can_convert_to_string(value: Any) -> bool:
vtype = type(value)
return isinstance(value, str) or vtype in [np.unicode_, np.string_, np.str_]
return isinstance(value, str) or vtype in [np.bytes_, np.str_]

@staticmethod
def identity(value: "T") -> "T":
@@ -230,7 +230,7 @@ def toString(value: Any) -> str:
"""
if isinstance(value, str):
return value
elif type(value) in [np.string_, np.str_, np.unicode_]:
elif type(value) in [np.bytes_, np.str_]:
return str(value)
else:
raise TypeError("Could not convert %s to string type" % type(value))
4 changes: 2 additions & 2 deletions python/pyspark/pandas/base.py
@@ -903,7 +903,7 @@ def isnull(self: IndexOpsLike) -> IndexOpsLike:

Examples
--------
>>> ser = ps.Series([5, 6, np.NaN])
>>> ser = ps.Series([5, 6, np.nan])
>>> ser.isna() # doctest: +NORMALIZE_WHITESPACE
0 False
1 False
@@ -939,7 +939,7 @@ def notnull(self: IndexOpsLike) -> IndexOpsLike:
--------
Show which entries in a Series are not NA.

>>> ser = ps.Series([5, 6, np.NaN])
>>> ser = ps.Series([5, 6, np.nan])
>>> ser
0 5.0
1 6.0
2 changes: 1 addition & 1 deletion python/pyspark/pandas/frame.py
@@ -10064,7 +10064,7 @@ def reindex(
number (0, 1).
copy : bool, default True
Return a new object, even if the passed indexes are the same.
fill_value : scalar, default np.NaN
fill_value : scalar, default np.nan
Value to use for missing values. Defaults to NaN, but can be any
"compatible" value.

7 changes: 7 additions & 0 deletions python/pyspark/pandas/indexes/base.py
@@ -2645,9 +2645,16 @@ def _test() -> None:
import sys
from pyspark.sql import SparkSession
import pyspark.pandas.indexes.base
from pandas.util.version import Version

os.chdir(os.environ["SPARK_HOME"])

if Version(np.__version__) >= Version("2"):
# Numpy 2.0+ changed its string format,
# adding type information to numeric scalars.
# `legacy="1.25"` is only available in `numpy>=2`
np.set_printoptions(legacy="1.25") # type: ignore[arg-type]

globs = pyspark.pandas.indexes.base.__dict__.copy()
globs["ps"] = pyspark.pandas
spark = (
7 changes: 7 additions & 0 deletions python/pyspark/pandas/indexing.py
@@ -1833,9 +1833,16 @@ def _test() -> None:
import sys
from pyspark.sql import SparkSession
import pyspark.pandas.indexing
from pandas.util.version import Version

os.chdir(os.environ["SPARK_HOME"])

if Version(np.__version__) >= Version("2"):
Contributor

ditto

# Numpy 2.0+ changed its string format,
# adding type information to numeric scalars.
# `legacy="1.25"` is only available in `numpy>=2`
np.set_printoptions(legacy="1.25") # type: ignore[arg-type]

globs = pyspark.pandas.indexing.__dict__.copy()
globs["ps"] = pyspark.pandas
spark = (
11 changes: 9 additions & 2 deletions python/pyspark/pandas/namespace.py
@@ -2812,7 +2812,7 @@ def notna(obj):
--------
Show which entries in a DataFrame are not NA.

>>> df = ps.DataFrame({'age': [5, 6, np.NaN],
>>> df = ps.DataFrame({'age': [5, 6, np.nan],
... 'born': [pd.NaT, pd.Timestamp('1939-05-27'),
... pd.Timestamp('1940-04-25')],
... 'name': ['Alfred', 'Batman', ''],
@@ -2831,7 +2831,7 @@ def notna(obj):

Show which entries in a Series are not NA.

>>> ser = ps.Series([5, 6, np.NaN])
>>> ser = ps.Series([5, 6, np.nan])
>>> ser
0 5.0
1 6.0
@@ -3731,9 +3731,16 @@ def _test() -> None:
import uuid
from pyspark.sql import SparkSession
import pyspark.pandas.namespace
from pandas.util.version import Version

os.chdir(os.environ["SPARK_HOME"])

if Version(np.__version__) >= Version("2"):
# Numpy 2.0+ changed its string format,
# adding type information to numeric scalars.
# `legacy="1.25"` is only available in `numpy>=2`
np.set_printoptions(legacy="1.25") # type: ignore[arg-type]

globs = pyspark.pandas.namespace.__dict__.copy()
globs["ps"] = pyspark.pandas
globs["sf"] = F
2 changes: 1 addition & 1 deletion python/pyspark/pandas/series.py
@@ -1893,7 +1893,7 @@ def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None)
index: array-like, optional
New labels / index to conform to, should be specified using keywords.
Preferably an Index object to avoid duplicating data
fill_value : scalar, default np.NaN
fill_value : scalar, default np.nan
Value to use for missing values. Defaults to NaN, but can be any
"compatible" value.

8 changes: 4 additions & 4 deletions python/pyspark/pandas/strings.py
@@ -862,7 +862,7 @@ def contains(
--------
Returning a Series of booleans using only a literal pattern.

>>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
>>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
>>> s1.str.contains('og', regex=False)
0 False
1 True
@@ -965,7 +965,7 @@ def count(self, pat: str, flags: int = 0) -> "ps.Series":

Examples
--------
>>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.NaN, 'CABA', 'cat'])
>>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
>>> s.str.count('a')
0 0.0
1 0.0
@@ -1327,7 +1327,7 @@ def pandas_ljust(s) -> ps.Series[str]:  # type: ignore[no-untyped-def]

return self._data.pandas_on_spark.transform_batch(pandas_ljust)

def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.NaN) -> "ps.Series":
def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.nan) -> "ps.Series":
"""
Determine if each string matches a regular expression.

@@ -1353,7 +1353,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.NaN) -

Examples
--------
>>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
>>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
>>> s.str.match('dog')
0 False
1 True
2 changes: 1 addition & 1 deletion python/pyspark/pandas/tests/indexes/test_astype.py
@@ -55,7 +55,7 @@ def test_astype(self):
self.assert_eq(psidx.astype(bool), pidx.astype(bool))
self.assert_eq(psidx.astype("bool"), pidx.astype("bool"))
self.assert_eq(psidx.astype("?"), pidx.astype("?"))
self.assert_eq(psidx.astype(np.unicode_), pidx.astype(np.unicode_))
self.assert_eq(psidx.astype(np.str_), pidx.astype(np.str_))
self.assert_eq(psidx.astype("str"), pidx.astype("str"))
self.assert_eq(psidx.astype("U"), pidx.astype("U"))

2 changes: 1 addition & 1 deletion python/pyspark/pandas/tests/series/test_arg_ops.py
@@ -149,7 +149,7 @@ def test_argmin_argmax(self):
self.assert_eq(pser.argmax(), psser.argmax())
self.assert_eq(pser.argmax(skipna=False), psser.argmax(skipna=False))

pser2 = pd.Series([np.NaN, 1.0, 2.0, np.NaN])
pser2 = pd.Series([np.nan, 1.0, 2.0, np.nan])
psser2 = ps.from_pandas(pser2)
self.assert_eq(pser2.argmin(), psser2.argmin())
self.assert_eq(pser2.argmax(), psser2.argmax())
4 changes: 2 additions & 2 deletions python/pyspark/pandas/tests/series/test_as_of.py
@@ -32,7 +32,7 @@ def test_asof(self):
self.assert_eq(psser.asof(20), pser.asof(20))
self.assert_eq(psser.asof([5, 20]).sort_index(), pser.asof([5, 20]).sort_index())
self.assert_eq(psser.asof(100), pser.asof(100))
self.assert_eq(repr(psser.asof(-100)), repr(pser.asof(-100)))
self.assert_eq(str(psser.asof(-100)), str(pser.asof(-100)))
self.assert_eq(psser.asof([-100, 100]).sort_index(), pser.asof([-100, 100]).sort_index())

# where cannot be an Index, Series or a DataFrame
@@ -55,7 +55,7 @@ def test_asof(self):

self.assert_eq(psser.asof("2014-01-01"), pser.asof("2014-01-01"))
self.assert_eq(psser.asof("2014-01-02"), pser.asof("2014-01-02"))
self.assert_eq(repr(psser.asof("1999-01-02")), repr(pser.asof("1999-01-02")))
self.assert_eq(str(psser.asof("1999-01-02")), str(pser.asof("1999-01-02")))

# SPARK-37482: Skip check monotonic increasing for Series.asof with 'compute.eager_check'
pser = pd.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40])
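For context, a small illustration (assuming NumPy >= 2.0) of why these assertions compare str() instead of repr(): the repr of a NumPy scalar now carries its type while str() is unchanged, so str() stays stable across NumPy versions when one side of the comparison yields a NumPy scalar:

import numpy as np

x = np.float64("nan")
print(repr(x))  # NumPy 2.x: np.float64(nan)   (NumPy 1.x: nan)
print(str(x))   # nan on either version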
2 changes: 1 addition & 1 deletion python/pyspark/pandas/tests/series/test_as_type.py
@@ -126,7 +126,7 @@ def _test_numeric_astype(self, pser):
self.assert_eq(psser.astype(bool), pser.astype(bool))
self.assert_eq(psser.astype("bool"), pser.astype("bool"))
self.assert_eq(psser.astype("?"), pser.astype("?"))
self.assert_eq(psser.astype(np.unicode_), pser.astype(np.unicode_))
self.assert_eq(psser.astype(np.str_), pser.astype(np.str_))
self.assert_eq(psser.astype("str"), pser.astype("str"))
self.assert_eq(psser.astype("U"), pser.astype("U"))

2 changes: 1 addition & 1 deletion python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -38,7 +38,7 @@ def pser(self):
"\nleading-whitespace",
"trailing-Whitespace \t",
None,
np.NaN,
np.nan,
]
)

@@ -38,7 +38,7 @@ def pser(self):
"\nleading-whitespace",
"trailing-Whitespace \t",
None,
np.NaN,
np.nan,
]
)

21 changes: 10 additions & 11 deletions python/pyspark/pandas/tests/test_typedef.py
@@ -93,12 +93,12 @@ def func() -> pd.Series[float]:
self.assertEqual(inferred.dtype, np.float64)
self.assertEqual(inferred.spark_type, DoubleType())

def func() -> "pd.DataFrame[np.float_, str]":
def func() -> "pd.DataFrame[np.float64, str]":
pass

expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
inferred = infer_return_type(func)
self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
self.assertEqual(inferred.dtypes, [np.float64, np.str_])
self.assertEqual(inferred.spark_type, expected)

def func() -> "pandas.DataFrame[float]":
@@ -121,10 +121,10 @@ def func() -> pd.DataFrame[np.float64, str]:

expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
inferred = infer_return_type(func)
self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
self.assertEqual(inferred.dtypes, [np.float64, np.str_])
self.assertEqual(inferred.spark_type, expected)

def func() -> pd.DataFrame[np.float_]:
def func() -> pd.DataFrame[np.float64]:
pass

expected = StructType([StructField("c0", DoubleType())])
@@ -167,12 +167,12 @@ def test_if_pandas_implements_class_getitem(self):
assert not ps._series_has_class_getitem

def test_infer_schema_with_names_pandas_instances(self):
def func() -> 'pd.DataFrame["a" : np.float_, "b":str]': # noqa: F405
def func() -> 'pd.DataFrame["a" : np.float64, "b":str]': # noqa: F405
pass

expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())])
inferred = infer_return_type(func)
self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
self.assertEqual(inferred.dtypes, [np.float64, np.str_])
self.assertEqual(inferred.spark_type, expected)

def func() -> "pd.DataFrame['a': float, 'b': int]": # noqa: F405
@@ -217,7 +217,7 @@ def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:

def test_infer_schema_with_names_pandas_instances_negative(self):
def try_infer_return_type():
def f() -> 'pd.DataFrame["a" : np.float_ : 1, "b":str:2]': # noqa: F405
def f() -> 'pd.DataFrame["a" : np.float64 : 1, "b":str:2]': # noqa: F405
pass

infer_return_type(f)
@@ -283,7 +283,7 @@ def f() -> ps.DataFrame[A]:
self.assertRaisesRegex(TypeError, "not understood", try_infer_return_type)

def try_infer_return_type():
def f() -> 'ps.DataFrame["a" : np.float_ : 1, "b":str:2]': # noqa: F405
def f() -> 'ps.DataFrame["a" : np.float64 : 1, "b":str:2]': # noqa: F405
pass

infer_return_type(f)
@@ -314,7 +314,6 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
# binary
np.character: (np.character, BinaryType()),
np.bytes_: (np.bytes_, BinaryType()),
np.string_: (np.bytes_, BinaryType()),
bytes: (np.bytes_, BinaryType()),
# integer
np.int8: (np.int8, ByteType()),
@@ -328,8 +327,8 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
np.float64: (np.float64, DoubleType()),
float: (np.float64, DoubleType()),
# string
np.unicode_: (np.unicode_, StringType()),
str: (np.unicode_, StringType()),
np.str_: (np.str_, StringType()),
str: (np.str_, StringType()),
# bool
bool: (np.bool_, BooleanType()),
# datetime
6 changes: 3 additions & 3 deletions python/pyspark/pandas/typedef/typehints.py
@@ -176,7 +176,7 @@ def as_spark_type(
return None
return types.ArrayType(element_type)
# BinaryType
elif tpe in (bytes, np.character, np.bytes_, np.string_):
Contributor
@itholic Jun 27, 2024

qq: why do we remove np.string_?

Contributor

Oh, nvm. I just checked the PR description.

elif tpe in (bytes, np.character, np.bytes_):
return types.BinaryType()
# BooleanType
elif tpe in (bool, np.bool_, "bool", "?"):
@@ -190,7 +190,7 @@ def as_spark_type(
elif tpe in (decimal.Decimal,):
# TODO: considering the precision & scale for decimal type.
return types.DecimalType(38, 18)
elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
elif tpe in (float, np.double, np.float64, "float", "float64", "double"):
return types.DoubleType()
elif tpe in (np.float32, "float32", "f"):
return types.FloatType()
@@ -201,7 +201,7 @@ def as_spark_type(
elif tpe in (np.int16, "int16", "short"):
return types.ShortType()
# StringType
elif tpe in (str, np.unicode_, "str", "U"):
elif tpe in (str, np.str_, "str", "U"):
return types.StringType()
# TimestampType or TimestampNTZType if timezone is not specified.
elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M", pd.Timestamp):
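Following up on the np.string_ question above: NumPy 2.0 (NEP 52) removed several long-deprecated aliases, and this PR switches to the canonical names, which also exist on NumPy 1.x. A hedged sketch of the mapping, using the as_spark_type helper from the hunk above (the import path is assumed from the file being edited):

import numpy as np
from pyspark.pandas.typedef.typehints import as_spark_type
from pyspark.sql import types

# Removed alias -> replacement used throughout this PR:
#   np.string_  -> np.bytes_
#   np.unicode_ -> np.str_
#   np.float_   -> np.float64
#   np.NaN      -> np.nan
assert as_spark_type(np.bytes_) == types.BinaryType()
assert as_spark_type(np.str_) == types.StringType()
assert as_spark_type(np.float64) == types.DoubleType()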
14 changes: 9 additions & 5 deletions python/pyspark/sql/tests/test_arrow_python_udf.py
@@ -60,7 +60,7 @@ def test_complex_input_types(self):
.first()
)

self.assertEqual(row[0], "[1, 2, 3]")
self.assertIn(row[0], ["[1, 2, 3]", "[np.int32(1), np.int32(2), np.int32(3)]"])
self.assertEqual(row[1], "{'a': 'b'}")
self.assertEqual(row[2], "Row(col1=1, col2=2)")

@@ -119,9 +119,10 @@ def test_register(self):
str_repr_func = self.spark.udf.register("str_repr", udf(lambda x: str(x), useArrow=True))

# To verify that Arrow optimization is on
self.assertEqual(
self.assertIn(
df.selectExpr("str_repr(array) AS str_id").first()[0],
"[1, 2, 3]", # The input is a NumPy array when the Arrow optimization is on
["[1, 2, 3]", "[np.int32(1), np.int32(2), np.int32(3)]"],
# The input is a NumPy array when the Arrow optimization is on
)

# To verify that a UserDefinedFunction is returned
@@ -132,11 +133,14 @@

def test_nested_array_input(self):
df = self.spark.range(1).selectExpr("array(array(1, 2), array(3, 4)) as nested_array")
self.assertEqual(
self.assertIn(
df.select(
udf(lambda x: str(x), returnType="string", useArrow=True)("nested_array")
).first()[0],
"[[1, 2], [3, 4]]",
[
"[[1, 2], [3, 4]]",
"[[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]]",
],
)

def test_type_coercion_string_to_numeric(self):
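For context, a small sketch (assuming NumPy >= 2.0) of why these assertions now accept either rendering: with Arrow optimization the UDF receives NumPy values, str() of a list falls back to the repr of its elements, and NumPy 2.0 scalar reprs include the type:

import numpy as np

values = [np.int32(1), np.int32(2), np.int32(3)]
print(str(values))
# NumPy 1.x: [1, 2, 3]
# NumPy 2.x: [np.int32(1), np.int32(2), np.int32(3)]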
4 changes: 3 additions & 1 deletion python/pyspark/sql/tests/test_udf.py
@@ -904,7 +904,9 @@ def test_nested_array(self):
df = self.spark.range(1).selectExpr("array(array(1, 2), array(3, 4)) as nested_array")
# Input
row = df.select(udf(lambda x: str(x))("nested_array")).first()
self.assertEqual(row[0], "[[1, 2], [3, 4]]")
self.assertIn(
row[0], ["[[1, 2], [3, 4]]", "[[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]]"]
)
# Output

@udf(returnType=df.dtypes[0][1])