BUG: handle missing indexers in duplicate indices similarly to how unique indices handle them (e.g. by reindexing)

jreback · jreback · commit b84d64935426 · 2013-05-14T17:42:44.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -91,6 +91,7 @@ pandas 0.11.1
       (removed warning) (GH2786_), and fix (GH3230_)
     - Fix to_csv to handle non-unique columns (GH3495_)
     - Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_)
+      and handle missing elements like unique indices (GH3561_)
     - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (GH3562_)
   - Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
   - Fixed bug in mixed-frame assignment with aligned series (GH3492_)
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -860,17 +860,23 @@ def get_indexer(self, target, method=None, limit=None):
         return com._ensure_platform_int(indexer)
 
     def get_indexer_non_unique(self, target, **kwargs):
-        """ return an indexer suitable for takng from a non unique index
-            return the labels in the same order ast the target,
-            target must be an iterable """
+        """ return an indexer suitable for taking from a non unique index
+            return the labels in the same order as the target, and
+            return a missing indexer into the target (missing are marked as -1
+            in the indexer); target must be an iterable """
         target = _ensure_index(target)
         pself, ptarget = self._possibly_promote(target)
         if pself is not self or ptarget is not target:
             return pself.get_indexer_non_unique(ptarget)
 
         if self.is_all_dates:
-            return Index(Index(self.asi8)._engine.get_indexer_non_unique(target.asi8))
-        return Index(self._engine.get_indexer_non_unique(target.values))
+            self = Index(self.asi8)
+            tgt_values = target.asi8
+        else:
+            tgt_values = target.values
+
+        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+        return Index(indexer), missing
 
     def _possibly_promote(self, other):
         # A hack, but it works
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -458,8 +458,23 @@ def _reindex(keys, level=None):
             if labels.is_unique:
                 return _reindex(keyarr, level=level)
             else:
-                indexer = labels.get_indexer_non_unique(keyarr)
-                return self.obj.take(indexer, axis=axis, convert=False)
+                indexer, missing = labels.get_indexer_non_unique(keyarr)
+                check = indexer != -1
+                result = self.obj.take(indexer[check], axis=axis, convert=False)
+
+                # need to merge the result labels and the missing labels
+                if len(missing):
+                    l = np.arange(len(indexer))
+
+                    missing_labels = keyarr.take(missing)
+                    missing_labels_indexer = l[~check]
+                    cur_labels = result._get_axis(axis).values
+                    cur_labels_indexer = l[check]
+                    new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
+                                                           missing_labels, missing_labels_indexer)
+                    result = result.reindex_axis(new_labels,axis=axis)
+
+                return result
 
     def _convert_to_indexer(self, obj, axis=0):
         """
@@ -569,7 +584,8 @@ def _convert_to_indexer(self, obj, axis=0):
 
                     # non-unique (dups)
                     else:
-                        indexer = check = labels.get_indexer_non_unique(objarr)
+                        indexer, missing = labels.get_indexer_non_unique(objarr)
+                        check = indexer
 
                 mask = check == -1
                 if mask.any():
diff --git a/pandas/index.pyx b/pandas/index.pyx
@@ -269,35 +269,44 @@ cdef class IndexEngine:
 
     def get_indexer_non_unique(self, targets):
         """ return an indexer suitable for takng from a non unique index
-            return the labels in the same order ast the target """
+            return the labels in the same order as the target
+            and a missing indexer into the targets (which correspond
+            to the -1 indices in the results) """
 
         cdef:
             ndarray values
-            ndarray[int64_t] result
+            ndarray[int64_t] result, missing
             object v, val
-            int count = 0
-            Py_ssize_t i, j, n
+            int count = 0, count_missing = 0
+            Py_ssize_t i, j, n, found
 
         self._ensure_mapping_populated()
         values = self._get_index_values()
         n = len(values)
         n_t = len(targets)
-        result = np.empty(n, dtype=np.int64)
+        result  = np.empty(n+n_t, dtype=np.int64)
+        missing = np.empty(n_t, dtype=np.int64)
 
         for i in range(n_t):
             val = util.get_value_at(targets, i)
+            found = 0
 
             for j in range(n):
                 v = util.get_value_at(values, j)
 
                 if v == val:
                    result[count] = j
                    count += 1
+                   found = 1
 
-        if count == 0:
-            raise KeyError
+            # value not found
+            if found == 0:
+                result[count] = -1
+                count += 1
+                missing[count_missing] = i
+                count_missing += 1
 
-        return result[0:count]
+        return result[0:count], missing[0:count_missing]
 
 cdef class Int64Engine(IndexEngine):
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns):
 
     return result
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
+                          ndarray b, ndarray[int64_t] b_indexer):
+    cdef:
+        Py_ssize_t i, n_a, n_b
+        ndarray result
+
+    n_a = len(a)
+    n_b = len(b)
+    result = np.empty(n_a+n_b,dtype=object)
+
+    for i in range(n_a):
+        result[a_indexer[i]] = a[i]
+    for i in range(n_b):
+        result[b_indexer[i]] = b[i]
+
+    return result
+
 
 def fast_zip(list ndarrays):
     '''
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -4676,7 +4676,8 @@ def _check_df(df,cols=None):
                     if df.columns.is_unique:
                         rs_c.columns = cols
                     else:
-                        rs_c.columns = df.columns.take(df.columns.get_indexer_non_unique(cols))
+                        indexer, missing = df.columns.get_indexer_non_unique(cols)
+                        rs_c.columns = df.columns.take(indexer)
 
                     for c in cols:
                        obj_df = df[c]
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -794,6 +794,18 @@ def test_dups_fancy_indexing(self):
         res = df.ix[Index(rows)]
         self.assert_(Index(rows).equals(res.index))
 
+        rows = ['C','B','E']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        # inconsistent returns for unique/duplicate indices when values are missing
+        df = DataFrame(randn(4,3),index=list('ABCD'))
+        expected = df.ix[['E']]
+
+        dfnu = DataFrame(randn(5,3),index=list('AABCD'))
+        result = dfnu.ix[['E']]
+        assert_frame_equal(result, expected)
+
     def test_indexing_mixed_frame_bug(self):
 
         # GH3492