Skip to content

BUG: (GH3561) non-unique indexers with a list-like now return in the same order as the passed values #3563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 14, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ pandas 0.11.1
(removed warning) (GH2786_), and fix (GH3230_)
- Fix to_csv to handle non-unique columns (GH3495_)
- Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_)
and handle missing elements like unique indices (GH3561_)
- Duplicate indexes with an empty DataFrame.from_records will return a correct frame (GH3562_)
- Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
- Fixed bug in mixed-frame assignment with aligned series (GH3492_)
Expand Down Expand Up @@ -148,6 +149,7 @@ pandas 0.11.1
.. _GH3552: https://github.com/pydata/pandas/issues/3552
.. _GH3562: https://github.com/pydata/pandas/issues/3562
.. _GH3586: https://github.com/pydata/pandas/issues/3586
.. _GH3561: https://github.com/pydata/pandas/issues/3561
.. _GH3493: https://github.com/pydata/pandas/issues/3493
.. _GH3579: https://github.com/pydata/pandas/issues/3579
.. _GH3593: https://github.com/pydata/pandas/issues/3593
Expand Down
3 changes: 3 additions & 0 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1368,6 +1368,9 @@ incompatible the new object internals are with the ``Index`` functions):
- ``slice_locs``: returns the "range" to slice between two labels
- ``get_indexer``: Computes the indexing vector for reindexing / data
alignment purposes. See the source / docstrings for more on this
- ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
alignment purposes when the index is non-unique. See the source / docstrings
for more on this
- ``reindex``: Does any pre-conversion of the input index then calls
``get_indexer``
- ``union``, ``intersection``: computes the union or intersection of two
Expand Down
19 changes: 19 additions & 0 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,25 @@ def get_indexer(self, target, method=None, limit=None):

return com._ensure_platform_int(indexer)

def get_indexer_non_unique(self, target, **kwargs):
    """
    Compute an indexer suitable for taking from a non-unique index.

    Parameters
    ----------
    target : iterable
        Labels to locate; coerced to an Index via ``_ensure_index``.

    Returns
    -------
    indexer : Index
        Positions into ``self``, in the same order as ``target``;
        labels absent from ``self`` are marked with -1.
    missing : ndarray of int
        Indexer into ``target`` for the labels that were not found
        (i.e. the positions corresponding to the -1 entries).
    """
    target = _ensure_index(target)

    # if either side needs promotion (e.g. to a datetime index),
    # redo the lookup on the promoted pair instead
    promoted_self, promoted_target = self._possibly_promote(target)
    if promoted_self is not self or promoted_target is not target:
        return promoted_self.get_indexer_non_unique(promoted_target)

    # datetime-like indexes are matched via their int64 (i8) values
    if self.is_all_dates:
        source = Index(self.asi8)
        target_values = target.asi8
    else:
        source = self
        target_values = target.values

    indexer, missing = source._engine.get_indexer_non_unique(target_values)
    return Index(indexer), missing

def _possibly_promote(self, other):
# A hack, but it works
from pandas.tseries.index import DatetimeIndex
Expand Down
35 changes: 19 additions & 16 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,8 +458,23 @@ def _reindex(keys, level=None):
if labels.is_unique:
return _reindex(keyarr, level=level)
else:
mask = labels.isin(keyarr)
return self.obj.take(mask.nonzero()[0], axis=axis, convert=False)
indexer, missing = labels.get_indexer_non_unique(keyarr)
check = indexer != -1
result = self.obj.take(indexer[check], axis=axis, convert=False)

# need to merge the result labels and the missing labels
if len(missing):
l = np.arange(len(indexer))

missing_labels = keyarr.take(missing)
missing_labels_indexer = l[~check]
cur_labels = result._get_axis(axis).values
cur_labels_indexer = l[check]
new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
missing_labels, missing_labels_indexer)
result = result.reindex_axis(new_labels,axis=axis)

return result

def _convert_to_indexer(self, obj, axis=0):
"""
Expand Down Expand Up @@ -569,20 +584,8 @@ def _convert_to_indexer(self, obj, axis=0):

# non-unique (dups)
else:
indexer = []
check = np.arange(len(labels))
lvalues = labels.values
for x in objarr:
# ugh
to_or = lib.map_infer(lvalues, x.__eq__)
if not to_or.any():
raise KeyError('%s not in index' % str(x))

# add the indicies (as we want to take)
indexer.extend(check[to_or])

indexer = Index(indexer)

indexer, missing = labels.get_indexer_non_unique(objarr)
check = indexer

mask = check == -1
if mask.any():
Expand Down
38 changes: 38 additions & 0 deletions pandas/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,46 @@ cdef class IndexEngine:
self._ensure_mapping_populated()
return self.mapping.lookup(values)

def get_indexer_non_unique(self, targets):
    """ return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results) """

    cdef:
        ndarray values
        ndarray[int64_t] result, missing
        object v, val
        Py_ssize_t i, j, n, n_t, n_alloc, found
        Py_ssize_t count = 0, count_missing = 0

    self._ensure_mapping_populated()
    values = self._get_index_values()
    n = len(values)
    n_t = len(targets)

    # n + n_t is only an initial estimate: when the same label is
    # duplicated on BOTH sides, the total number of matches can exceed
    # it (e.g. 3 dups in values x 2 dups in targets = 6 hits), so the
    # result buffer is grown on demand below
    n_alloc = n + n_t
    result = np.empty(n_alloc, dtype=np.int64)
    missing = np.empty(n_t, dtype=np.int64)

    for i in range(n_t):
        val = util.get_value_at(targets, i)
        found = 0

        for j in range(n):
            v = util.get_value_at(values, j)

            if v == val:
                # grow the output if the initial estimate was exceeded
                if count >= n_alloc:
                    n_alloc = n_alloc * 2
                    result = np.resize(result, n_alloc)
                result[count] = j
                count += 1
                found = 1

        # value not found: emit -1 and record its position in targets
        if found == 0:
            if count >= n_alloc:
                n_alloc = n_alloc * 2
                result = np.resize(result, n_alloc)
            result[count] = -1
            count += 1
            missing[count_missing] = i
            count_missing += 1

    return result[0:count], missing[0:count_missing]

cdef class Int64Engine(IndexEngine):

Expand Down
19 changes: 19 additions & 0 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns):

return result

@cython.wraparound(False)
@cython.boundscheck(False)
def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
                          ndarray b, ndarray[int64_t] b_indexer):
    """
    Merge two arrays into a single object array, scattering each
    element to the output position given by its indexer:
    ``result[a_indexer[i]] = a[i]`` and ``result[b_indexer[i]] = b[i]``.

    The output has length ``len(a) + len(b)``; the two indexers are
    assumed to form a partition of ``range(len(a) + len(b))`` so every
    slot is filled exactly once.
    """
    cdef:
        Py_ssize_t i, n_a, n_b
        ndarray result

    n_a = len(a)
    n_b = len(b)
    # NOTE(review): boundscheck is disabled, so out-of-range indexer
    # values are NOT validated here -- callers must guarantee the
    # partition property above
    result = np.empty(n_a+n_b,dtype=object)

    for i in range(n_a):
        result[a_indexer[i]] = a[i]
    for i in range(n_b):
        result[b_indexer[i]] = b[i]

    return result


def fast_zip(list ndarrays):
'''
Expand Down
25 changes: 23 additions & 2 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4668,8 +4668,29 @@ def _check_df(df,cols=None):
with ensure_clean() as path:
df.to_csv(path,cols = cols,chunksize=chunksize)
rs_c = pd.read_csv(path,index_col=0)
rs_c.columns = df.columns
assert_frame_equal(df,rs_c,check_names=False)

# we wrote them in a different order
# so compare them in that order
if cols is not None:

if df.columns.is_unique:
rs_c.columns = cols
else:
indexer, missing = df.columns.get_indexer_non_unique(cols)
rs_c.columns = df.columns.take(indexer)

for c in cols:
obj_df = df[c]
obj_rs = rs_c[c]
if isinstance(obj_df,Series):
assert_series_equal(obj_df,obj_rs)
else:
assert_frame_equal(obj_df,obj_rs,check_names=False)

# wrote in the same order
else:
rs_c.columns = df.columns
assert_frame_equal(df,rs_c,check_names=False)

chunksize=5
N = int(chunksize*2.5)
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,28 @@ def test_dups_fancy_indexing(self):

assert_frame_equal(df,result)

# GH 3561, dups not in selected order
ind = ['A', 'A', 'B', 'C']
df = DataFrame({'test':range(len(ind))}, index=ind)
rows = ['C', 'B']
res = df.ix[rows]
self.assert_(rows == list(res.index))

res = df.ix[Index(rows)]
self.assert_(Index(rows).equals(res.index))

rows = ['C','B','E']
res = df.ix[rows]
self.assert_(rows == list(res.index))

# inconcistent returns for unique/duplicate indices when values are missing
df = DataFrame(randn(4,3),index=list('ABCD'))
expected = df.ix[['E']]

dfnu = DataFrame(randn(5,3),index=list('AABCD'))
result = dfnu.ix[['E']]
assert_frame_equal(result, expected)

def test_indexing_mixed_frame_bug(self):

# GH3492
Expand Down