BUG: non-unique indexers with a list-like now return in the same order as the passed values

jreback · jreback · commit ffc90d68e1de · 2013-05-14T17:42:44.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -148,6 +148,7 @@ pandas 0.11.1
 .. _GH3552: https://github.com/pydata/pandas/issues/3552
 .. _GH3562: https://github.com/pydata/pandas/issues/3562
 .. _GH3586: https://github.com/pydata/pandas/issues/3586
+.. _GH3561: https://github.com/pydata/pandas/issues/3561
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
 .. _GH3579: https://github.com/pydata/pandas/issues/3579
 .. _GH3593: https://github.com/pydata/pandas/issues/3593
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -1368,6 +1368,9 @@ incompatible the new object internals are with the ``Index`` functions):
   - ``slice_locs``: returns the "range" to slice between two labels
   - ``get_indexer``: Computes the indexing vector for reindexing / data
     alignment purposes. See the source / docstrings for more on this
+  - ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
+    alignment purposes when the index is non-unique. See the source / docstrings 
+    for more on this
   - ``reindex``: Does any pre-conversion of the input index then calls
     ``get_indexer``
   - ``union``, ``intersection``: computes the union or intersection of two
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -859,6 +859,19 @@ def get_indexer(self, target, method=None, limit=None):
 
         return com._ensure_platform_int(indexer)
 
+    def get_indexer_non_unique(self, target, **kwargs):
+        """ return an indexer suitable for taking from a non unique index
+            return the labels in the same order as the target,
+            target must be an iterable """
+        target = _ensure_index(target)
+        pself, ptarget = self._possibly_promote(target)
+        if pself is not self or ptarget is not target:
+            return pself.get_indexer_non_unique(ptarget)
+
+        if self.is_all_dates:
+            return Index(Index(self.asi8)._engine.get_indexer_non_unique(target.asi8))
+        return Index(self._engine.get_indexer_non_unique(target.values))
+
     def _possibly_promote(self, other):
         # A hack, but it works
         from pandas.tseries.index import DatetimeIndex
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -458,8 +458,8 @@ def _reindex(keys, level=None):
             if labels.is_unique:
                 return _reindex(keyarr, level=level)
             else:
-                mask = labels.isin(keyarr)
-                return self.obj.take(mask.nonzero()[0], axis=axis, convert=False)
+                indexer = labels.get_indexer_non_unique(keyarr)
+                return self.obj.take(indexer, axis=axis, convert=False)
 
     def _convert_to_indexer(self, obj, axis=0):
         """
@@ -569,20 +569,7 @@ def _convert_to_indexer(self, obj, axis=0):
 
                     # non-unique (dups)
                     else:
-                        indexer = []
-                        check   = np.arange(len(labels))
-                        lvalues = labels.values
-                        for x in objarr:
-                            # ugh
-                            to_or = lib.map_infer(lvalues, x.__eq__)
-                            if not to_or.any():
-                                raise KeyError('%s not in index' % str(x))
-
-                            # add the indicies (as we want to take)
-                            indexer.extend(check[to_or])
-
-                        indexer = Index(indexer)
-
+                        indexer = check = labels.get_indexer_non_unique(objarr)
 
                 mask = check == -1
                 if mask.any():
diff --git a/pandas/index.pyx b/pandas/index.pyx
@@ -267,8 +267,50 @@ cdef class IndexEngine:
         self._ensure_mapping_populated()
         return self.mapping.lookup(values)
 
+    def get_indexer_non_unique(self, targets):
+        """ return an indexer suitable for taking from a non unique index
+            return the positions in the same order as the targets
+
+            every position whose value equals a target is emitted, so the
+            result can be longer than the index when targets repeat;
+            targets with no match contribute nothing, and KeyError is
+            raised only when nothing matched at all """
 
+        cdef:
+            ndarray values
+            ndarray[int64_t] result
+            object v, val
+            Py_ssize_t count = 0
+            Py_ssize_t i, j, n, n_t, n_alloc
+
+        self._ensure_mapping_populated()
+        values = self._get_index_values()
+        n = len(values)
+        n_t = len(targets)
+
+        # n is only a starting guess for the output size; see the resize below
+        n_alloc = n
+        result = np.empty(n_alloc, dtype=np.int64)
+
+        for i in range(n_t):
+            val = util.get_value_at(targets, i)
+
+            for j in range(n):
+                v = util.get_value_at(values, j)
+
+                if v == val:
+                    # repeated targets can produce up to n * n_t matches,
+                    # so grow the buffer rather than write past its end
+                    if count >= n_alloc:
+                        n_alloc = 2 * n_alloc
+                        result = np.resize(result, n_alloc)
+
+                    result[count] = j
+                    count += 1
+
+        if count == 0:
+            raise KeyError
 
+        return result[0:count]
 
 cdef class Int64Engine(IndexEngine):
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -4668,8 +4668,28 @@ def _check_df(df,cols=None):
             with ensure_clean() as path:
                 df.to_csv(path,cols = cols,chunksize=chunksize)
                 rs_c = pd.read_csv(path,index_col=0)
-                rs_c.columns = df.columns
-                assert_frame_equal(df,rs_c,check_names=False)
+
+                # we wrote them in a different order
+                # so compare them in that order
+                if cols is not None:
+
+                    if df.columns.is_unique:
+                        rs_c.columns = cols
+                    else:
+                        rs_c.columns = df.columns.take(df.columns.get_indexer_non_unique(cols))
+
+                    for c in cols:
+                       obj_df = df[c]
+                       obj_rs = rs_c[c]
+                       if isinstance(obj_df,Series):
+                           assert_series_equal(obj_df,obj_rs)
+                       else:
+                           assert_frame_equal(obj_df,obj_rs,check_names=False) 
+
+                # wrote in the same order
+                else:
+                    rs_c.columns = df.columns
+                    assert_frame_equal(df,rs_c,check_names=False)
 
         chunksize=5
         N = int(chunksize*2.5)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -784,6 +784,16 @@ def test_dups_fancy_indexing(self):
 
         assert_frame_equal(df,result)
 
+        # GH 3561, dups not in selected order
+        ind = ['A', 'A', 'B', 'C']
+        df = DataFrame({'test':range(len(ind))}, index=ind)
+        rows = ['C', 'B']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        res = df.ix[Index(rows)]
+        self.assert_(Index(rows).equals(res.index))
+
     def test_indexing_mixed_frame_bug(self):
 
         # GH3492