Skip to content

Commit 4c2f998

Browse files
committed
PERF: speed up vectorized string functions by:
pushing the loop to Cython and eliminating the convert_objects call by creating the result array directly (this needs specific routines that have a matching call signature). Overhead still remains in the actual x.endswith call (which operates on a PyObject).
1 parent 4a114d2 commit 4c2f998

File tree

4 files changed

+56
-5
lines changed

4 files changed

+56
-5
lines changed

pandas/core/strings.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import re
77
import pandas.lib as lib
88

9-
109
def _get_array_list(arr, others):
1110
if isinstance(others[0], (list, np.ndarray)):
1211
arrays = [arr] + list(others)
@@ -203,9 +202,26 @@ def str_endswith(arr, pat, na=np.nan):
203202
-------
204203
endswith : array (boolean)
205204
"""
206-
f = lambda x: x.endswith(pat)
207-
return _na_map(f, arr, na)
208-
205+
if not isinstance(arr, np.ndarray):
206+
arr = np.asarray(arr, dtype=object)
207+
if True:
208+
mask = isnull(arr)
209+
try:
210+
result = lib.string_na_map_bool(arr.values, 'endswith', [ pat ], mask.view(np.uint8))
211+
except (TypeError, AttributeError):
212+
def g(x):
213+
try:
214+
return f(x)
215+
except (TypeError, AttributeError):
216+
return na_value
217+
return _map(g, arr)
218+
#if na_value is not np.nan:
219+
# np.putmask(result, mask, na_value)
220+
# if result.dtype == object:
221+
# result = lib.maybe_convert_objects(result)
222+
return result
223+
else:
224+
return lib.map_infer(arr, f)
209225

210226
def str_lower(arr):
211227
"""

pandas/lib.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,3 +1280,4 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys,
12801280
include "reduce.pyx"
12811281
include "properties.pyx"
12821282
include "inference.pyx"
1283+
include "strings.pyx"

pandas/src/strings.pyx

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
cimport util
2+
3+
def string_na_map_bool(ndarray arr, object fn, list args, ndarray[uint8_t] mask,
                       bint convert=1):
    '''
    Apply a boolean-valued method to every non-null element of arr

    Parameters
    ----------
    arr : ndarray
        Elements to test; every unmasked element must provide the method
        named by fn
    fn : function name
        Name of the method to call on each element, e.g. 'endswith'
    args : a list of the args to pass function
    mask : ndarray[uint8_t]
        Nonzero where the corresponding element of arr is null; those
        positions are written as 0 and the method is not called on them
    convert : boolean, default True
        Accepted for signature compatibility; not used by this routine

    Returns
    -------
    mapped : ndarray of uint8
        1 where the method returned a true value, 0 otherwise (including
        every masked position)
    '''
    cdef:
        Py_ssize_t i, n
        ndarray[uint8_t] result

    n = len(arr)
    result = np.empty(n, dtype=np.uint8)
    for i in range(n):
        if mask[i]:
            # null entries never reach the method call
            result[i] = 0
        else:
            # dispatch on fn instead of hard-coding endswith, so the
            # routine honours the method name and argument list it is given
            result[i] = 1 if getattr(arr[i], fn)(*args) else 0

    return result
31+
32+
cdef inline bint string_endswith(object x, object pat) except -1:
    # Return whether the Python string x ends with pat.
    #
    # x is typed as `object` because the endswith method lives on the
    # Python string object; a raw C `char *` has no methods to call.
    # `except -1` lets an exception raised by the call propagate to the
    # caller rather than being lost inside this cdef function.
    return x.endswith(pat)
34+

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ def run(self):
561561
cmdclass['build_src'] = DummyBuildSrc
562562
cmdclass['build_ext'] = CheckingBuildExt
563563

564-
lib_depends = ['reduce', 'inference', 'properties']
564+
lib_depends = ['reduce', 'inference', 'properties', 'strings']
565565

566566

567567
def srcpath(name=None, suffix='.pyx', subdir='src'):

0 commit comments

Comments
 (0)