Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit b07bb9a

Browse files
authored
Call astype scalable (#628)
Series.astype parallel
1 parent c5b2a0c commit b07bb9a

File tree

3 files changed

+11
-39
lines changed

3 files changed

+11
-39
lines changed

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 9 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1777,42 +1777,20 @@ def hpat_pandas_series_astype(self, dtype, copy=True, errors='raise'):
17771777
errors in ('raise', 'ignore')):
17781778
ty_checker.raise_exc(errors, 'str', 'errors')
17791779

1780-
# Return StringArray for astype(str) or astype('str')
1781-
def hpat_pandas_series_astype_to_str_impl(self, dtype, copy=True, errors='raise'):
1782-
num_chars = 0
1783-
arr_len = len(self._data)
1784-
1785-
# Get total chars for new array
1786-
for i in prange(arr_len):
1787-
item = self._data[i]
1788-
num_chars += len(str(item)) # TODO: check NA
1789-
1790-
data = pre_alloc_string_array(arr_len, num_chars)
1791-
for i in prange(arr_len):
1792-
item = self._data[i]
1793-
data[i] = str(item) # TODO: check NA
1794-
1795-
return pandas.Series(data=data, index=self._index, name=self._name)
1796-
17971780
# Return npytypes.Array from npytypes.Array for astype(types.functions.NumberClass), example - astype(np.int64)
1798-
def hpat_pandas_series_astype_numba_impl(self, dtype, copy=True, errors='raise'):
1799-
return pandas.Series(data=self._data.astype(dtype), index=self._index, name=self._name)
1800-
18011781
# Return npytypes.Array from npytypes.Array for astype(types.StringLiteral), example - astype('int64')
1802-
def hpat_pandas_series_astype_literal_type_numba_impl(self, dtype, copy=True, errors='raise'):
1803-
return pandas.Series(data=self._data.astype(numpy.dtype(dtype)), index=self._index, name=self._name)
1782+
def hpat_pandas_series_astype_numba_impl(self, dtype, copy=True, errors='raise'):
1783+
return pandas.Series(data=numpy_like.astype(self._data, dtype), index=self._index, name=self._name)
18041784

18051785
# Return self
18061786
def hpat_pandas_series_astype_no_modify_impl(self, dtype, copy=True, errors='raise'):
18071787
return pandas.Series(data=self._data, index=self._index, name=self._name)
18081788

1809-
1810-
if ((isinstance(dtype, types.Function) and dtype.typing_key == str)
1811-
or (isinstance(dtype, types.StringLiteral) and dtype.literal_value == 'str')):
1812-
return hpat_pandas_series_astype_to_str_impl
1789+
str_check = ((isinstance(dtype, types.Function) and dtype.typing_key == str) or
1790+
(isinstance(dtype, types.StringLiteral) and dtype.literal_value == 'str'))
18131791

18141792
# Needs Numba astype impl support converting unicode_type to NumberClass and other types
1815-
if isinstance(self.data, StringArrayType):
1793+
if (isinstance(self.data, StringArrayType) and not str_check):
18161794
if isinstance(dtype, types.functions.NumberClass) and errors == 'raise':
18171795
raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype}')
18181796
if isinstance(dtype, types.StringLiteral) and errors == 'raise':
@@ -1823,18 +1801,12 @@ def hpat_pandas_series_astype_no_modify_impl(self, dtype, copy=True, errors='rai
18231801
else:
18241802
raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype.literal_value}')
18251803

1826-
if isinstance(self.data, types.npytypes.Array) and isinstance(dtype, types.functions.NumberClass):
1827-
return hpat_pandas_series_astype_numba_impl
1804+
data_narr = isinstance(self.data, types.npytypes.Array)
1805+
dtype_num_liter = isinstance(dtype, (types.functions.NumberClass, types.StringLiteral))
18281806

1829-
if isinstance(self.data, types.npytypes.Array) and isinstance(dtype, types.StringLiteral):
1830-
try:
1831-
literal_value = numpy.dtype(dtype.literal_value)
1832-
except:
1833-
pass # Will raise the exception later
1834-
else:
1835-
return hpat_pandas_series_astype_literal_type_numba_impl
1807+
if data_narr and dtype_num_liter or str_check:
1808+
return hpat_pandas_series_astype_numba_impl
18361809

1837-
# Raise error if dtype is not supported
18381810
if errors == 'raise':
18391811
raise TypingError(f'{_func_name} The object must be a supported type. Given dtype: {dtype}')
18401812
else:

sdc/functions/numpy_like.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ def sdc_astype_overload(self, dtype):
108108
"""
109109

110110
ty_checker = TypeChecker("numpy-like 'astype'")
111-
if not isinstance(self, types.Array):
111+
if not isinstance(self, (types.Array, StringArrayType)):
112112
return None
113113

114114
if not isinstance(dtype, (types.functions.NumberClass, types.Function, types.Literal)):

sdc/tests/tests_perf/test_perf_series.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ def _test_case(self, pyfunc, name, total_data_length, data_num=1, input_data=tes
6868
TC(name='append', size=[10 ** 7], params='other', data_num=2),
6969
TC(name='apply', size=[10 ** 7], params='lambda x: x'),
7070
TC(name='argsort', size=[10 ** 4]),
71-
TC(name='astype', size=[10 ** 5], call_expr='data.astype(np.int8)', usecase_params='data',
71+
TC(name='astype', size=[10 ** 8], call_expr='data.astype(np.int8)', usecase_params='data',
7272
input_data=[test_global_input_data_float64[0]]),
7373
TC(name='at', size=[10 ** 7], call_expr='data.at[3]', usecase_params='data'),
7474
TC(name='chain_add_and_sum', size=[20 * 10 ** 6, 25 * 10 ** 6, 30 * 10 ** 6], call_expr='(A + B).sum()',

0 commit comments

Comments
 (0)