Skip to content

Commit b84d649

Browse files
committed
BUG: handle missing indexers in duplicate indices similarly to how unique indices handle them (e.g. by reindexing)
1 parent ffc90d6 commit b84d649

File tree

7 files changed

+81
-17
lines changed

7 files changed

+81
-17
lines changed

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ pandas 0.11.1
9191
(removed warning) (GH2786_), and fix (GH3230_)
9292
- Fix to_csv to handle non-unique columns (GH3495_)
9393
- Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_)
94+
and handle missing elements like unique indices (GH3561_)
9495
- Duplicate indexes with an empty DataFrame.from_records will return a correct frame (GH3562_)
9596
- Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
9697
- Fixed bug in mixed-frame assignment with aligned series (GH3492_)

pandas/core/index.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -860,17 +860,23 @@ def get_indexer(self, target, method=None, limit=None):
860860
return com._ensure_platform_int(indexer)
861861

862862
def get_indexer_non_unique(self, target, **kwargs):
863-
""" return an indexer suitable for takng from a non unique index
864-
return the labels in the same order ast the target,
865-
target must be an iterable """
863+
""" return an indexer suitable for taking from a non unique index
864+
return the labels in the same order as the target, and
865+
return a missing indexer into the target (missing are marked as -1
866+
in the indexer); target must be an iterable """
866867
target = _ensure_index(target)
867868
pself, ptarget = self._possibly_promote(target)
868869
if pself is not self or ptarget is not target:
869870
return pself.get_indexer_non_unique(ptarget)
870871

871872
if self.is_all_dates:
872-
return Index(Index(self.asi8)._engine.get_indexer_non_unique(target.asi8))
873-
return Index(self._engine.get_indexer_non_unique(target.values))
873+
self = Index(self.asi8)
874+
tgt_values = target.asi8
875+
else:
876+
tgt_values = target.values
877+
878+
indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
879+
return Index(indexer), missing
874880

875881
def _possibly_promote(self, other):
876882
# A hack, but it works

pandas/core/indexing.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -458,8 +458,23 @@ def _reindex(keys, level=None):
458458
if labels.is_unique:
459459
return _reindex(keyarr, level=level)
460460
else:
461-
indexer = labels.get_indexer_non_unique(keyarr)
462-
return self.obj.take(indexer, axis=axis, convert=False)
461+
indexer, missing = labels.get_indexer_non_unique(keyarr)
462+
check = indexer != -1
463+
result = self.obj.take(indexer[check], axis=axis, convert=False)
464+
465+
# need to merge the result labels and the missing labels
466+
if len(missing):
467+
l = np.arange(len(indexer))
468+
469+
missing_labels = keyarr.take(missing)
470+
missing_labels_indexer = l[~check]
471+
cur_labels = result._get_axis(axis).values
472+
cur_labels_indexer = l[check]
473+
new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
474+
missing_labels, missing_labels_indexer)
475+
result = result.reindex_axis(new_labels,axis=axis)
476+
477+
return result
463478

464479
def _convert_to_indexer(self, obj, axis=0):
465480
"""
@@ -569,7 +584,8 @@ def _convert_to_indexer(self, obj, axis=0):
569584

570585
# non-unique (dups)
571586
else:
572-
indexer = check = labels.get_indexer_non_unique(objarr)
587+
indexer, missing = labels.get_indexer_non_unique(objarr)
588+
check = indexer
573589

574590
mask = check == -1
575591
if mask.any():

pandas/index.pyx

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -269,35 +269,44 @@ cdef class IndexEngine:
269269

270270
def get_indexer_non_unique(self, targets):
271271
""" return an indexer suitable for takng from a non unique index
272-
return the labels in the same order ast the target """
272+
return the labels in the same order as the target
273+
and a missing indexer into the targets (which correspond
274+
to the -1 indices in the results) """
273275

274276
cdef:
275277
ndarray values
276-
ndarray[int64_t] result
278+
ndarray[int64_t] result, missing
277279
object v, val
278-
int count = 0
279-
Py_ssize_t i, j, n
280+
int count = 0, count_missing = 0
281+
Py_ssize_t i, j, n, found
280282

281283
self._ensure_mapping_populated()
282284
values = self._get_index_values()
283285
n = len(values)
284286
n_t = len(targets)
285-
result = np.empty(n, dtype=np.int64)
287+
result = np.empty(n+n_t, dtype=np.int64)
288+
missing = np.empty(n_t, dtype=np.int64)
286289

287290
for i in range(n_t):
288291
val = util.get_value_at(targets, i)
292+
found = 0
289293

290294
for j in range(n):
291295
v = util.get_value_at(values, j)
292296

293297
if v == val:
294298
result[count] = j
295299
count += 1
300+
found = 1
296301

297-
if count == 0:
298-
raise KeyError
302+
# value not found
303+
if found == 0:
304+
result[count] = -1
305+
count += 1
306+
missing[count_missing] = i
307+
count_missing += 1
299308

300-
return result[0:count]
309+
return result[0:count], missing[0:count_missing]
301310

302311
cdef class Int64Engine(IndexEngine):
303312

pandas/lib.pyx

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns):
416416

417417
return result
418418

419+
@cython.wraparound(False)
420+
@cython.boundscheck(False)
421+
def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
422+
ndarray b, ndarray[int64_t] b_indexer):
423+
cdef:
424+
Py_ssize_t i, n_a, n_b
425+
ndarray result
426+
427+
n_a = len(a)
428+
n_b = len(b)
429+
result = np.empty(n_a+n_b,dtype=object)
430+
431+
for i in range(n_a):
432+
result[a_indexer[i]] = a[i]
433+
for i in range(n_b):
434+
result[b_indexer[i]] = b[i]
435+
436+
return result
437+
419438

420439
def fast_zip(list ndarrays):
421440
'''

pandas/tests/test_frame.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4676,7 +4676,8 @@ def _check_df(df,cols=None):
46764676
if df.columns.is_unique:
46774677
rs_c.columns = cols
46784678
else:
4679-
rs_c.columns = df.columns.take(df.columns.get_indexer_non_unique(cols))
4679+
indexer, missing = df.columns.get_indexer_non_unique(cols)
4680+
rs_c.columns = df.columns.take(indexer)
46804681

46814682
for c in cols:
46824683
obj_df = df[c]

pandas/tests/test_indexing.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,18 @@ def test_dups_fancy_indexing(self):
794794
res = df.ix[Index(rows)]
795795
self.assert_(Index(rows).equals(res.index))
796796

797+
rows = ['C','B','E']
798+
res = df.ix[rows]
799+
self.assert_(rows == list(res.index))
800+
801+
# inconsistent returns for unique/duplicate indices when values are missing
802+
df = DataFrame(randn(4,3),index=list('ABCD'))
803+
expected = df.ix[['E']]
804+
805+
dfnu = DataFrame(randn(5,3),index=list('AABCD'))
806+
result = dfnu.ix[['E']]
807+
assert_frame_equal(result, expected)
808+
797809
def test_indexing_mixed_frame_bug(self):
798810

799811
# GH3492

0 commit comments

Comments
 (0)