Modify behavior of append on duplicates

araraonline · araraonline · commit eee2daea0a79 · 2018-09-29T20:42:30.000-03:00
Previously, when the columns index contained duplicates, sorting was
allowed, and duplicates were accepted as long as the indexes had the
same values (as determined by idx.tolist()).

Since pandas cannot sort an index that contains duplicate values
(DataFrame.reindex fails), and since comparing indexes by value is
counter-productive and prone to failure depending on the index types
involved, the behavior has been changed as follows:

- When sort=True and there are duplicates in at least one index, an
  error is raised and append stops.
- DataFrames whose indexes contain duplicates are only joined when the
  indexes share the same identity (that is, they are the same object,
  i.e. `idx1 is idx2`).

Some other improvements to the code have also been made, and I believe
it is better overall.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6420,31 +6420,15 @@ def _append_list_of_frames(self, other, *args, **kwargs):
         _obj_type = kwargs['_obj_type']
         _item_type = kwargs.get('_item_type')
 
-        from pandas.core.indexes.api import (
-            CannotSortError,
-            _normalize_dataframes,
-        )
+        from pandas.core.indexes.api import _normalize_dataframes
         from pandas.core.reshape.concat import concat
 
-        # The default value of sort in version 0.23.0 is None.
-        # The behavior when this was the value is very
-        # varied and changes according to input type, columns index
-        # type, whether a reindex is necessary or not, etc.
-        #
-        # The code below is a try to reproduce the old behavior,
-        # but note that this is deprecated.
-        #
-        # TODO: handle sort=None here
-
-        # The behavior of concat is a bit problematic as it is. To get around
-        # this, we prepare the DataFrames before feeding them into concat.
+        # TODO: sorting behavior when sort=None
+
+        # The behavior of concat is a bit problematic as it is. To get around,
+        # we prepare the DataFrames before feeding them into concat.
         to_concat = [self] + other
-        try:
-            to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
-        except CannotSortError:
-            raise TypeError("The resulting columns could not be sorted."
-                            " You can try setting sort=False or use"
-                            " compatible index types.")
+        to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
         result = concat(to_concat_norm, ignore_index=ignore_index,
                         verify_integrity=verify_integrity, sort=sort)
 
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -12,8 +12,7 @@
     _new_Index,
     ensure_index,
     ensure_index_from_sequences,
-    CannotSortError,
-    InvalidIndexError
+    InvalidIndexError,
 )
 from pandas.core.indexes.category import CategoricalIndex  # noqa
 from pandas.core.indexes.multi import MultiIndex  # noqa
@@ -38,6 +37,18 @@
 """)
 
 
+class _CannotSortError(Exception):
+    pass
+
+
+class _CannotSortDuplicatesError(Exception):
+    pass
+
+
+class _DuplicatesError(Exception):
+    pass
+
+
 # TODO: there are many places that rely on these private methods existing in
 # pandas.core.index
 __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
@@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
     ----------
     index_list: list of Index objects
     verify_inputs: boolean, default True
-        Verify if the input indexes contain overlapping values.
+        Verify if the input indexes contain duplicate values. Ignored when all
+        input indexes share the same identity (a is b).
     sort: boolean, default False
-        Order result index. If False, values will come in the order they
+        Order resulting index. If False, values will come in the order they
         appear.
 
     Raises
     ------
-    CannotSortError
-        When sort=True and the result index is not sortable.
-    InvalidIndexError
-        When verify_inputs=True and 1+ of the indexes contain duplicates.
+    InvalidIndexError:
+        When there are duplicates in at least one of the indexes (col)
+        and they are not allowed.
+    TypeError:
+        When sort=True and the resulting index (col) could not be sorted.
     """
     orig_columns = [df.columns for df in frame_list]
-    merged_columns = _merge_index_list(orig_columns, verify_inputs, sort)
+
+    kwargs = {
+        'verify_dups': verify_inputs,
+        'allow_matching_dups': verify_inputs,
+        'sort': sort,
+    }
+
+    try:
+        merged_columns = _merge_index_list(orig_columns, **kwargs)
+    except _DuplicatesError:
+        raise InvalidIndexError("Indexes with duplicates are only allowed"
+                                " when they are the same (a is b).")
+    except _CannotSortDuplicatesError:
+        raise InvalidIndexError("When sort=True, indexes with duplicate"
+                                " values are not allowed.")
+    except _CannotSortError:
+        raise TypeError("The resulting columns could not be sorted."
+                        " You can try setting sort=False or use"
+                        " compatible index types.")
 
     # Because _merge_index_list may infer the index dtype based on values,
     # we have to provide a workaround to conserve the original dtype.
@@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
     return [_reindex(df, merged_columns, axis=1) for df in frame_list]
 
 
-def _merge_index_list(index_list, verify_inputs=True, sort=False):
+def _merge_index_list(index_list,
+                      verify_dups=True,
+                      allow_matching_dups=False,
+                      sort=False):
     """Merge a list of indexes into one big index
 
     Parameters
     ----------
     index_list: list of Index objects
-    verify_inputs: boolean, default True
-        Verify if the input indexes contain overlapping values.
+    verify_dups: boolean, default True
+        Verify if the input indexes contain duplicate values.
+    allow_matching_dups: boolean, default False
+        Only relevant when verify_dups=True. Allow duplicate values when all
+        indexes have the same identity.
     sort: boolean, default False
         Order result index. If False, values will come in the order they
         appear.
 
     Raises
     ------
-    CannotSortError
+    _CannotSortError
         When sort=True and the result index is not sortable.
-    InvalidIndexError
-        When verify_inputs=True and 1+ of the indexes contain duplicates.
+    _CannotSortDuplicatesError
+        When sort=True and at least one of the inputs contain duplicate
+        values.
+    _DuplicatesError
+        When verify_dups=True and at least one of the input indexes contain
+        duplicate values. This is error is not raised if
+        allow_matching_dups=True and all the indexes have a common identity.
     """
-    if verify_inputs:
-        if any([ix.has_duplicates for ix in index_list]):
-            raise InvalidIndexError("Input index has duplicate values")
-
-    result = index_list[0]
-    for idx in index_list[1:]:
+    # unique index list (a is b)
+    uindex_list = com.get_distinct_objs(index_list)
+
+    # verify duplicates
+    if sort or verify_dups:
+        has_dups = any(ix.has_duplicates for ix in uindex_list)
+        if has_dups:
+            if sort:
+                raise _CannotSortDuplicatesError("Cannot sort an index that"
+                                                " contains duplicate values.")
+            elif verify_dups and not allow_matching_dups:
+                raise _DuplicatesError("Index has duplicate values.")
+            elif verify_dups and allow_matching_dups and len(uindex_list) >= 2:
+                raise _DuplicatesError("Index has duplicate values and does"
+                                      " not match other indexes.")
+
+    # edge results
+    if len(uindex_list) == 0:
+        return pd.Index()
+    elif len(uindex_list) == 1:
+        return uindex_list[0]
+
+    # reduce to one result
+    result = uindex_list[0]
+    for idx in uindex_list[1:]:
         result = _merge_indexes(result, idx)
 
+    # sort
     return result if not sort else _sort_index(result)
 
 
@@ -278,7 +340,7 @@ def _sort_index(index):
     try:
         return index.sort_values()
     except TypeError:
-        raise CannotSortError
+        raise _CannotSortError
 
 
 def _reindex(df, new_index, axis=0):
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -158,10 +158,6 @@ def index_arithmetic_method(self, other):
     return set_function_name(index_arithmetic_method, name, cls)
 
 
-class CannotSortError(Exception):
-    pass
-
-
 class InvalidIndexError(Exception):
     pass
 
diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py
@@ -39,6 +39,36 @@
 ]
 
 
+indexes_with_dups = [
+    # base
+    pd.Index(['A', 'B', 'B']),
+    pd.Index(['B', 'B', 'A']),
+    pd.Index(['A', 'B', 'B'], name='foo'),
+    pd.Index(['B', 'B', 'A'], name='bar'),
+
+    # numeric
+    pd.Index([9, 10, 10], dtype=object),
+    pd.Int64Index([3, 4, 4]),
+    pd.UInt64Index([6, 7, 7]),
+    pd.Float64Index([3.5, 4.5, 4.5]),
+
+    # datetime
+    pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-10']),
+    pd.to_timedelta(['1 day', '2 days', '2 days']),
+    pd.PeriodIndex([2000, 2001, 2001], freq='A'),
+
+    # interval
+    pd.IntervalIndex.from_arrays([0, 1, 1], [1, 2, 2]),
+
+    # categorical
+    pd.CategoricalIndex('A B B'.split()),
+    pd.CategoricalIndex('D E E'.split(), ordered=True),
+
+    # multi-index
+    pd.MultiIndex.from_arrays(['A B B'.split(), 'D E E'.split()]),
+]
+
+
 index_sort_groups = [
     # When indexes from the same group are joined, the result is sortable.
     # When indexes from different groups are joined, the result is not
@@ -403,39 +433,90 @@ def test_preserve_index_values_with_sort(self, index1, index2):
         for value in index2:
             assert value in result.columns
 
-    def test_raise_on_duplicates(self, sort):
-        # Append should not allow DataFrames with repeated
-        # column names (or series with repeated row names).
-
-        # dupe on base
-        df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
-        with pytest.raises(InvalidIndexError):
-            df1.append([], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2, df2], sort=sort)
-
-        # dupe on other
-        df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
-        df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2, df2], sort=sort)
-
-        # dupe on both
-        # (we could avoid raising errors here, but, to keep the api
-        #  consistent, we don't)
-        df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        with pytest.raises(InvalidIndexError):
-            df1.append([], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2, df2], sort=sort)
+    @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
+    def test_good_duplicates_without_sort(self, col_index):
+        # When all indexes have the same identity (a is b), duplicates should
+        # be allowed and append works.
+
+        df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
+        df2 = pd.DataFrame([[4, 5, 6]], columns=col_index)
+
+        # df1.append([])
+        result = df1.append([], sort=False)
+        expected = df1.copy()
+        assert_frame_equal(result, expected)
+
+        # df1.append([df2])
+        result = df1.append([df2], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+        # df1.append([df2, df2])
+        result = df1.append([df2, df2], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [4, 5, 6]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+        # df2.append([])
+        result = df2.append([], sort=False)
+        expected = df2.copy()
+        assert_frame_equal(result, expected)
+
+        # df2.append([df1])
+        result = df2.append([df1], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[4, 5, 6], [1, 2, 3]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+        # df2.append([df1, df1])
+        result = df2.append([df1, df1], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[4, 5, 6], [1, 2, 3], [1, 2, 3]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
+    def test_bad_duplicates_without_sort(self, col_index):
+        # When the indexes do not share a common identity, duplicates are not
+        # allowed and append raises.
+
+        df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
+        df2 = pd.DataFrame([[4, 5, 6]], columns=col_index)
+        df3 = pd.DataFrame([[7, 8, 9]], columns=col_index.copy())  # different
+        ctx = pytest.raises(InvalidIndexError,
+                            match=r'Indexes with duplicates.*a is b.*')
+        with ctx:
+            result = df1.append([df3], sort=False)
+        with ctx:
+            result = df1.append([df2, df3], sort=False)
+        with ctx:
+            result = df1.append([df3, df2], sort=False)
+        with ctx:
+            result = df1.append([df3, df3], sort=False)
+
+    @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
+    def test_duplicates_with_sort(self, col_index):
+        # When sort=True, indexes with duplicate values are not be allowed.
+
+        df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
+        df2 = pd.DataFrame([[4, 5, 6]], columns=col_index.copy())
+        ctx = pytest.raises(InvalidIndexError,
+                            match=r'When sort=True, indexes with dupl.*')
+
+        with ctx:
+            result = df1.append([], sort=True)
+        with ctx:
+            result = df1.append([df1], sort=True)
+        with ctx:
+            result = df1.append([df2], sort=True)
+        with ctx:
+            result = df1.append([df1, df1], sort=True)
+        with ctx:
+            result = df1.append([df1, df2], sort=True)
+        with ctx:
+            result = df1.append([df2, df1], sort=True)
+        with ctx:
+            result = df1.append([df2, df2], sort=True)
 
     def test_nosort_basic(self):
         # When sort=False, the resulting columns come