diff --git a/RELEASE.rst b/RELEASE.rst
index 31627cec01d1e..4e6570669656d 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -91,6 +91,7 @@ pandas 0.11.1
     (removed warning) (GH2786_), and fix (GH3230_)
   - Fix to_csv to handle non-unique columns (GH3495_)
   - Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_)
+    and handle missing elements like unique indices (GH3561_)
   - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (GH3562_)
   - Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
   - Fixed bug in mixed-frame assignment with aligned series (GH3492_)
@@ -148,6 +149,7 @@ pandas 0.11.1
 .. _GH3552: https://github.com/pydata/pandas/issues/3552
 .. _GH3562: https://github.com/pydata/pandas/issues/3562
 .. _GH3586: https://github.com/pydata/pandas/issues/3586
+.. _GH3561: https://github.com/pydata/pandas/issues/3561
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
 .. _GH3579: https://github.com/pydata/pandas/issues/3579
 .. _GH3593: https://github.com/pydata/pandas/issues/3593
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index d67a2d51cc1b8..55b7e653c3630 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -1368,6 +1368,9 @@ incompatible the new object internals are with the ``Index`` functions):
 - ``slice_locs``: returns the "range" to slice between two labels
 - ``get_indexer``: Computes the indexing vector for reindexing / data
   alignment purposes. See the source / docstrings for more on this
+- ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
+  alignment purposes when the index is non-unique. See the source / docstrings
+  for more on this
 - ``reindex``: Does any pre-conversion of the input index then calls
   ``get_indexer``
 - ``union``, ``intersection``: computes the union or intersection of two
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 7baae543714ec..3e5a4f5676437 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -859,6 +859,25 @@ def get_indexer(self, target, method=None, limit=None):
 
         return com._ensure_platform_int(indexer)
 
+    def get_indexer_non_unique(self, target, **kwargs):
+        """ return an indexer suitable for taking from a non unique index
+            return the labels in the same order as the target, and
+            return a missing indexer into the target (missing are marked as -1
+            in the indexer); target must be an iterable """
+        target = _ensure_index(target)
+        pself, ptarget = self._possibly_promote(target)
+        if pself is not self or ptarget is not target:
+            return pself.get_indexer_non_unique(ptarget)
+
+        if self.is_all_dates:
+            self = Index(self.asi8)
+            tgt_values = target.asi8
+        else:
+            tgt_values = target.values
+
+        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+        return Index(indexer), missing
+
     def _possibly_promote(self, other):
         # A hack, but it works
         from pandas.tseries.index import DatetimeIndex
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index bc8b7a3646a33..29adce4e02591 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -458,8 +458,23 @@ def _reindex(keys, level=None):
             if labels.is_unique:
                 return _reindex(keyarr, level=level)
             else:
-                mask = labels.isin(keyarr)
-                return self.obj.take(mask.nonzero()[0], axis=axis, convert=False)
+                indexer, missing = labels.get_indexer_non_unique(keyarr)
+                check = indexer != -1
+                result = self.obj.take(indexer[check], axis=axis, convert=False)
+
+                # need to merge the result labels and the missing labels
+                if len(missing):
+                    l = np.arange(len(indexer))
+
+                    missing_labels = keyarr.take(missing)
+                    missing_labels_indexer = l[~check]
+                    cur_labels = result._get_axis(axis).values
+                    cur_labels_indexer = l[check]
+                    new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
+                                                           missing_labels, missing_labels_indexer)
+                    result = result.reindex_axis(new_labels,axis=axis)
+
+                return result
 
     def _convert_to_indexer(self, obj, axis=0):
         """
@@ -569,20 +584,8 @@ def _convert_to_indexer(self, obj, axis=0):
 
                 # non-unique (dups)
                 else:
-                    indexer = []
-                    check = np.arange(len(labels))
-                    lvalues = labels.values
-                    for x in objarr:
-                        # ugh
-                        to_or = lib.map_infer(lvalues, x.__eq__)
-                        if not to_or.any():
-                            raise KeyError('%s not in index' % str(x))
-
-                        # add the indicies (as we want to take)
-                        indexer.extend(check[to_or])
-
-                    indexer = Index(indexer)
-
+                    indexer, missing = labels.get_indexer_non_unique(objarr)
+                    check = indexer
 
                 mask = check == -1
                 if mask.any():
diff --git a/pandas/index.pyx b/pandas/index.pyx
index 2ad5474549ec6..7d33d6083d0eb 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -267,8 +267,46 @@ cdef class IndexEngine:
 
         self._ensure_mapping_populated()
         return self.mapping.lookup(values)
 
+    def get_indexer_non_unique(self, targets):
+        """ return an indexer suitable for taking from a non unique index
+            return the labels in the same order as the target
+            and a missing indexer into the targets (which correspond
+            to the -1 indices in the results) """
+        cdef:
+            ndarray values
+            ndarray[int64_t] result, missing
+            object v, val
+            int count = 0, count_missing = 0
+            Py_ssize_t i, j, n, found
+
+        self._ensure_mapping_populated()
+        values = self._get_index_values()
+        n = len(values)
+        n_t = len(targets)
+        result = np.empty(n+n_t, dtype=np.int64)
+        missing = np.empty(n_t, dtype=np.int64)
+
+        for i in range(n_t):
+            val = util.get_value_at(targets, i)
+            found = 0
+
+            for j in range(n):
+                v = util.get_value_at(values, j)
+
+                if v == val:
+                    result[count] = j
+                    count += 1
+                    found = 1
+
+            # value not found
+            if found == 0:
+                result[count] = -1
+                count += 1
+                missing[count_missing] = i
+                count_missing += 1
+        return result[0:count], missing[0:count_missing]
 
 
 cdef class Int64Engine(IndexEngine):
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index d043691bc061e..30c65d9fcdd9f 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns):
 
     return result
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
+                          ndarray b, ndarray[int64_t] b_indexer):
+    cdef:
+        Py_ssize_t i, n_a, n_b
+        ndarray result
+
+    n_a = len(a)
+    n_b = len(b)
+    result = np.empty(n_a+n_b,dtype=object)
+
+    for i in range(n_a):
+        result[a_indexer[i]] = a[i]
+    for i in range(n_b):
+        result[b_indexer[i]] = b[i]
+
+    return result
+
 def fast_zip(list ndarrays):
     '''
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index ce89dda63597f..e92cc22dccaf6 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -4668,8 +4668,29 @@ def _check_df(df,cols=None):
             with ensure_clean() as path:
                 df.to_csv(path,cols = cols,chunksize=chunksize)
                 rs_c = pd.read_csv(path,index_col=0)
-                rs_c.columns = df.columns
-                assert_frame_equal(df,rs_c,check_names=False)
+
+                # we wrote them in a different order
+                # so compare them in that order
+                if cols is not None:
+
+                    if df.columns.is_unique:
+                        rs_c.columns = cols
+                    else:
+                        indexer, missing = df.columns.get_indexer_non_unique(cols)
+                        rs_c.columns = df.columns.take(indexer)
+
+                    for c in cols:
+                        obj_df = df[c]
+                        obj_rs = rs_c[c]
+                        if isinstance(obj_df,Series):
+                            assert_series_equal(obj_df,obj_rs)
+                        else:
+                            assert_frame_equal(obj_df,obj_rs,check_names=False)
+
+                # wrote in the same order
+                else:
+                    rs_c.columns = df.columns
+                    assert_frame_equal(df,rs_c,check_names=False)
 
         chunksize=5
         N = int(chunksize*2.5)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 01651f2674a90..46fd98fc14ffb 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -784,6 +784,28 @@ def test_dups_fancy_indexing(self):
 
         assert_frame_equal(df,result)
 
+        # GH 3561, dups not in selected order
+        ind = ['A', 'A', 'B', 'C']
+        df = DataFrame({'test':range(len(ind))}, index=ind)
+        rows = ['C', 'B']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        res = df.ix[Index(rows)]
+        self.assert_(Index(rows).equals(res.index))
+
+        rows = ['C','B','E']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        # inconsistent returns for unique/duplicate indices when values are missing
+        df = DataFrame(randn(4,3),index=list('ABCD'))
+        expected = df.ix[['E']]
+
+        dfnu = DataFrame(randn(5,3),index=list('AABCD'))
+        result = dfnu.ix[['E']]
+        assert_frame_equal(result, expected)
+
     def test_indexing_mixed_frame_bug(self):
 
         # GH3492
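
Illustrative note (not part of the patch): a minimal sketch of how the new ``Index.get_indexer_non_unique`` API behaves, with values that assume the behaviour this patch adds; the ``.ix`` tests above rely on it so that duplicate indexes handle missing labels the same way unique indexes do::

    from pandas import Index

    idx = Index(['A', 'A', 'B', 'C'])    # a non-unique index
    target = Index(['C', 'B', 'E'])      # 'E' is not present in idx

    indexer, missing = idx.get_indexer_non_unique(target)
    # indexer -> positions into idx, in target order, -1 where not found: [3, 2, -1]
    # missing -> positions into the target of the not-found labels: [2]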