Skip to content

Commit 9100b1d

Browse files
committed
ENH: use faster Cython code for DataFrame.count, GH #341
1 parent 79a8609 commit 9100b1d

File tree

5 files changed

+56
-68
lines changed

5 files changed

+56
-68
lines changed

pandas/core/frame.py

Lines changed: 21 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -2435,52 +2435,26 @@ def count(self, axis=0, level=None, numeric_only=False):
24352435
return result
24362436

24372437
def _count_level(self, level, axis=0, numeric_only=False):
2438-
# TODO: deal with sortedness??
2439-
obj = self.sortlevel(level, axis=axis)
2440-
axis_index = obj._get_axis(axis)
2441-
y, _ = self._get_agg_data(axis, numeric_only=numeric_only)
2442-
mask = notnull(y)
2443-
2444-
level_index = axis_index.levels[level]
2438+
if numeric_only:
2439+
frame = self._get_numeric_data()
2440+
else:
2441+
frame = self
24452442

2446-
if len(self) == 0:
2447-
return DataFrame(np.zeros((len(level_index),
2448-
len(self.columns)), dtype=int),
2449-
index=level_index, columns=self.columns)
2443+
if axis == 1:
2444+
frame = frame.T
24502445

2451-
n = len(level_index)
2452-
locs = axis_index.labels[level].searchsorted(np.arange(n))
2446+
mask = notnull(frame.values)
2447+
level_index = frame.index.levels[level]
2448+
counts = lib.count_level_2d(mask, frame.index.labels[level],
2449+
len(level_index))
24532450

2454-
# WORKAROUND: reduceat fusses about the endpoints. should file ticket?
2455-
start = locs.searchsorted(0, side='right') - 1
2456-
end = locs.searchsorted(len(mask), side='left')
2451+
result = DataFrame(counts, index=level_index,
2452+
columns=frame.columns)
24572453

2458-
if axis == 0:
2459-
index = level_index
2460-
columns = self.columns
2461-
result = np.zeros((n, len(self.columns)), dtype=int)
2462-
out = result[start:end]
2463-
np.add.reduceat(mask, locs[start:end], axis=axis, out=out)
2454+
if axis == 1:
2455+
return result.T
24642456
else:
2465-
index = self.index
2466-
columns = level_index
2467-
result = np.zeros((len(self.index), n), dtype=int)
2468-
out = result[:, start:end]
2469-
np.add.reduceat(mask, locs[start:end], axis=axis, out=out)
2470-
2471-
# WORKAROUND: to see why, try this
2472-
# arr = np.ones((10, 4), dtype=bool)
2473-
# np.add.reduceat(arr, [0, 3, 3, 7, 9], axis=0)
2474-
2475-
# this stinks
2476-
if len(locs) > 1:
2477-
workaround_mask = locs[:-1] == locs[1:]
2478-
if axis == 0:
2479-
result[:-1][workaround_mask] = 0
2480-
else:
2481-
result[:, :-1][:, workaround_mask] = 0
2482-
2483-
return DataFrame(result, index=index, columns=columns)
2457+
return result
24842458

24852459
def sum(self, axis=0, numeric_only=True, skipna=True, level=None):
24862460
if level is not None:
@@ -2568,7 +2542,7 @@ def median(self, axis=0, skipna=True, level=None):
25682542
return self._agg_by_level('median', axis=axis, level=level,
25692543
skipna=skipna)
25702544

2571-
frame = self._get_numeric_frame()
2545+
frame = self._get_numeric_data()
25722546

25732547
if axis == 0:
25742548
values = frame.values.T
@@ -2598,7 +2572,7 @@ def mad(self, axis=0, skipna=True, level=None):
25982572
return self._agg_by_level('mad', axis=axis, level=level,
25992573
skipna=skipna)
26002574

2601-
frame = self._get_numeric_frame()
2575+
frame = self._get_numeric_data()
26022576

26032577
if axis == 0:
26042578
demeaned = frame - frame.mean(axis=0)
@@ -2665,12 +2639,6 @@ def skew(self, axis=0, skipna=True, level=None):
26652639
return Series(result, index=axis_labels)
26662640
_add_stat_doc(skew, 'unbiased skewness', 'skew')
26672641

2668-
def _get_numeric_frame(self):
2669-
frame = self
2670-
if self._is_mixed_type:
2671-
frame = self.ix[:, self._get_numeric_columns()]
2672-
return frame
2673-
26742642
def _agg_by_level(self, name, axis=0, level=0, skipna=True):
26752643
method = getattr(type(self), name)
26762644
applyf = lambda x: method(x, axis=axis, skipna=skipna)
@@ -2945,8 +2913,10 @@ def _write_to_buffer(self):
29452913
to_write = []
29462914

29472915
if len(frame.columns) == 0 or len(frame.index) == 0:
2948-
to_write.append('Empty %s\n' % type(self.frame).__name__)
2949-
to_write.append(repr(frame.index))
2916+
info_line = 'Empty %s\nColumns: %s\nIndex: %s'
2917+
to_write.append(info_line % (type(self.frame).__name__,
2918+
repr(frame.columns),
2919+
repr(frame.index)))
29502920
else:
29512921
# may include levels names also
29522922
str_index = self._get_formatted_index()

pandas/core/series.py

Lines changed: 10 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -597,23 +597,19 @@ def count(self, level=None):
597597
nobs : int or Series (if level specified)
598598
"""
599599
if level is not None:
600-
return self._count_level(level)
600+
mask = notnull(self.values)
601+
level_index = self.index.levels[level]
601602

602-
return notnull(self.values).sum()
603-
604-
def _count_level(self, level):
605-
# TODO: GENERALIZE CODE OVERLAP WITH DATAFRAME
606-
mask = notnull(self.values)
607-
level_index = self.index.levels[level]
603+
if len(self) == 0:
604+
return Series(0, index=level_index)
608605

609-
if len(self) == 0:
610-
return Series(0, index=level_index)
606+
# call cython function
607+
max_bin = len(level_index)
608+
counts = lib.count_level_1d(mask.view(np.uint8),
609+
self.index.labels[level], max_bin)
610+
return Series(counts, index=level_index)
611611

612-
# call cython function
613-
max_bin = len(level_index)
614-
counts = lib.count_level_1d(mask.view(np.uint8),
615-
self.index.labels[level], max_bin)
616-
return Series(counts, index=level_index)
612+
return notnull(self.values).sum()
617613

618614
def value_counts(self):
619615
"""

pandas/src/groupby.pyx

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -526,7 +526,7 @@ def _bucket_locs(index, buckets, inclusive=False):
526526
return locs
527527

528528
def count_level_1d(ndarray[uint8_t, cast=True] mask,
529-
ndarray[int32_t] labels, Py_ssize_t max_bin):
529+
ndarray[int32_t] labels, Py_ssize_t max_bin):
530530
cdef:
531531
Py_ssize_t i, n
532532
ndarray[int64_t] counts
@@ -541,6 +541,22 @@ def count_level_1d(ndarray[uint8_t, cast=True] mask,
541541

542542
return counts
543543

544+
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
545+
ndarray[int32_t] labels, Py_ssize_t max_bin):
546+
cdef:
547+
Py_ssize_t i, j, k, n
548+
ndarray[int64_t, ndim=2] counts
549+
550+
n, k = (<object> mask).shape
551+
counts = np.zeros((max_bin, k), dtype='i8')
552+
553+
for i from 0 <= i < n:
554+
for j from 0 <= j < k:
555+
if mask[i, j]:
556+
counts[labels[i], j] += 1
557+
558+
return counts
559+
544560

545561
'''
546562

pandas/tests/test_frame.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1351,8 +1351,9 @@ def test_to_string(self):
13511351
joined = '\n'.join([re.sub('\s+', ' ', x).strip() for x in lines[1:]])
13521352
recons = read_table(StringIO(joined), names=header, sep=' ')
13531353
assert_series_equal(recons['B'], biggie['B'])
1354-
assert_series_equal(np.round(recons['A'], 2),
1355-
np.round(biggie['A'], 2))
1354+
self.assertEqual(recons['A'].count(), biggie['A'].count())
1355+
self.assert_((np.abs(recons['A'].dropna() -
1356+
biggie['A'].dropna()) < 0.1).all())
13561357

13571358
# expected = ['B', 'A']
13581359
# self.assertEqual(header, expected)

pandas/tests/test_multilevel.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -294,6 +294,11 @@ def _check_counts(frame, axis=0):
294294
result = frame.count(axis=axis, level=i)
295295
expected = frame.groupby(axis=axis, level=i).count(axis=axis)
296296

297+
self.frame.ix[1, [1, 2]] = np.nan
298+
self.frame.ix[7, [0, 1]] = np.nan
299+
self.ymd.ix[1, [1, 2]] = np.nan
300+
self.ymd.ix[7, [0, 1]] = np.nan
301+
297302
_check_counts(self.frame)
298303
_check_counts(self.ymd)
299304
_check_counts(self.frame.T, axis=1)

0 commit comments

Comments
 (0)