Merge pull request #7 from araraonline/duplicate

araraonline · web-flow · commit 5dfe32e37fe0 · 2018-09-29T20:44:26.000-03:00
Handle columns index duplicates
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6420,31 +6420,15 @@ def _append_list_of_frames(self, other, *args, **kwargs):
         _obj_type = kwargs['_obj_type']
         _item_type = kwargs.get('_item_type')
 
-        from pandas.core.indexes.api import (
-            CannotSortError,
-            _normalize_dataframes,
-        )
+        from pandas.core.indexes.api import _normalize_dataframes
         from pandas.core.reshape.concat import concat
 
-        # The default value of sort in version 0.23.0 is None.
-        # The behavior when this was the value is very
-        # varied and changes according to input type, columns index
-        # type, whether a reindex is necessary or not, etc.
-        #
-        # The code below is a try to reproduce the old behavior,
-        # but note that this is deprecated.
-        #
-        # TODO: handle sort=None here
-
-        # The behavior of concat is a bit problematic as it is. To get around
-        # this, we prepare the DataFrames before feeding them into concat.
+        # TODO: sorting behavior when sort=None
+
+        # The behavior of concat is a bit problematic as it is. To get around
+        # this, we prepare the DataFrames before feeding them into concat.
         to_concat = [self] + other
-        try:
-            to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
-        except CannotSortError:
-            raise TypeError("The resulting columns could not be sorted."
-                            " You can try setting sort=False or use"
-                            " compatible index types.")
+        to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
         result = concat(to_concat_norm, ignore_index=ignore_index,
                         verify_integrity=verify_integrity, sort=sort)
 
@@ -6454,45 +6438,13 @@ def _append_list_of_frames(self, other, *args, **kwargs):
         if not ignore_index:
             result.index.name = self.index.name
 
-        # the conditionals below will be refactored or removed
-
-        if sort is None:
-            # The sorting behaviour for None was weird.
-            # It is getting deprecated.
-            #
-            # By now, fix tests by only sorting when the
-            # original 'other' was a series or a dict.
-            if _obj_type in (dict, Series):
-                sort = False
-            elif _item_type in (dict, Series):
-                # A list of dicts/Series had a different behaviour
-                # when sorting is None.
-                #
-                # We do not sort if the 'other' columns are all
-                # contained in self.columns. Otherwise we do
-                # sort.
-                #
-                # TODO: as per documentation, this seems like the original
-                # behaviour intended for append. Should I implement this
-                # for any inputs that come?
-                self_idx = self.columns
-                other_idx = other[0].columns
-                idx_diff = other_idx.difference(self_idx)
-                sort = len(idx_diff) > 0
-            else:
-                sort = True
-
+        # Reindexing the columns created an artificial float64 where it
+        # was not needed. We can convert the columns back to the expected
+        # type.
         if result.shape[0] == 1:
-            from pandas.core.dtypes.cast import find_common_type
-
-            # Reindexing the columns created an artificial float64 where it
-            # was not needed. We can convert the columns back to the expected
-            # type.
-
-            for col in result:
-                types = [df[col].dtype for df in to_concat if col in df]
-                common_type = find_common_type(types)
-                result[col] = result[col].astype(common_type)
+            base_frame = next(df for df in to_concat_norm if df.shape[0] == 1)
+            dtypes = base_frame.dtypes.to_dict()
+            result = result.astype(dtypes)  # won't work well with dup columns
 
         return result
 
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -12,8 +12,7 @@
     _new_Index,
     ensure_index,
     ensure_index_from_sequences,
-    CannotSortError,
-    InvalidIndexError
+    InvalidIndexError,
 )
 from pandas.core.indexes.category import CategoricalIndex  # noqa
 from pandas.core.indexes.multi import MultiIndex  # noqa
@@ -38,6 +37,18 @@
 """)
 
 
+class _CannotSortError(Exception):
+    pass
+
+
+class _CannotSortDuplicatesError(Exception):
+    pass
+
+
+class _DuplicatesError(Exception):
+    pass
+
+
 # TODO: there are many places that rely on these private methods existing in
 # pandas.core.index
 __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
@@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
     ----------
     index_list: list of Index objects
     verify_inputs: boolean, default True
-        Verify if the input indexes contain overlapping values.
+        Verify if the input indexes contain duplicate values. Ignored when all
+        input indexes share the same identity (a is b).
     sort: boolean, default False
-        Order result index. If False, values will come in the order they
+        Order resulting index. If False, values will come in the order they
         appear.
 
     Raises
     ------
-    CannotSortError
-        When sort=True and the result index is not sortable.
-    InvalidIndexError
-        When verify_inputs=True and 1+ of the indexes contain duplicates.
+    InvalidIndexError
+        When there are duplicates in at least one of the indexes (col)
+        and they are not allowed.
+    TypeError
+        When sort=True and the resulting index (col) could not be sorted.
     """
     orig_columns = [df.columns for df in frame_list]
-    merged_columns = _merge_index_list(orig_columns, verify_inputs, sort)
+
+    kwargs = {
+        'verify_dups': verify_inputs,
+        'allow_matching_dups': verify_inputs,
+        'sort': sort,
+    }
+
+    try:
+        merged_columns = _merge_index_list(orig_columns, **kwargs)
+    except _DuplicatesError:
+        raise InvalidIndexError("Indexes with duplicates are only allowed"
+                                " when they are the same (a is b).")
+    except _CannotSortDuplicatesError:
+        raise InvalidIndexError("When sort=True, indexes with duplicate"
+                                " values are not allowed.")
+    except _CannotSortError:
+        raise TypeError("The resulting columns could not be sorted."
+                        " You can try setting sort=False or use"
+                        " compatible index types.")
 
     # Because _merge_index_list may infer the index dtype based on values,
     # we have to provide a workaround to conserve the original dtype.
@@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
     return [_reindex(df, merged_columns, axis=1) for df in frame_list]
 
 
-def _merge_index_list(index_list, verify_inputs=True, sort=False):
+def _merge_index_list(index_list,
+                      verify_dups=True,
+                      allow_matching_dups=False,
+                      sort=False):
     """Merge a list of indexes into one big index
 
     Parameters
     ----------
     index_list: list of Index objects
-    verify_inputs: boolean, default True
-        Verify if the input indexes contain overlapping values.
+    verify_dups: boolean, default True
+        Verify if the input indexes contain duplicate values.
+    allow_matching_dups: boolean, default False
+        Only relevant when verify_dups=True. Allow duplicate values when all
+        indexes have the same identity.
     sort: boolean, default False
         Order result index. If False, values will come in the order they
         appear.
 
     Raises
     ------
-    CannotSortError
+    _CannotSortError
         When sort=True and the result index is not sortable.
-    InvalidIndexError
-        When verify_inputs=True and 1+ of the indexes contain duplicates.
+    _CannotSortDuplicatesError
+        When sort=True and at least one of the inputs contains duplicate
+        values.
+    _DuplicatesError
+        When verify_dups=True and at least one of the input indexes contains
+        duplicate values. This error is not raised if
+        allow_matching_dups=True and all the indexes have a common identity.
     """
-    if verify_inputs:
-        if any([ix.has_duplicates for ix in index_list]):
-            raise InvalidIndexError("Input index has duplicate values")
-
-    result = index_list[0]
-    for idx in index_list[1:]:
+    # unique index list (a is b)
+    uindex_list = com.get_distinct_objs(index_list)
+
+    # verify duplicates
+    if sort or verify_dups:
+        has_dups = any(ix.has_duplicates for ix in uindex_list)
+        if has_dups:
+            if sort:
+                raise _CannotSortDuplicatesError("Cannot sort an index that"
+                                                " contains duplicate values.")
+            elif verify_dups and not allow_matching_dups:
+                raise _DuplicatesError("Index has duplicate values.")
+            elif verify_dups and allow_matching_dups and len(uindex_list) >= 2:
+                raise _DuplicatesError("Index has duplicate values and does"
+                                      " not match other indexes.")
+
+    # edge results
+    if len(uindex_list) == 0:
+        return pd.Index()
+    elif len(uindex_list) == 1:
+        return uindex_list[0]
+
+    # reduce to one result
+    result = uindex_list[0]
+    for idx in uindex_list[1:]:
         result = _merge_indexes(result, idx)
 
+    # sort
     return result if not sort else _sort_index(result)
 
 
@@ -278,7 +340,7 @@ def _sort_index(index):
     try:
         return index.sort_values()
     except TypeError:
-        raise CannotSortError
+        raise _CannotSortError
 
 
 def _reindex(df, new_index, axis=0):
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -158,10 +158,6 @@ def index_arithmetic_method(self, other):
     return set_function_name(index_arithmetic_method, name, cls)
 
 
-class CannotSortError(Exception):
-    pass
-
-
 class InvalidIndexError(Exception):
     pass
 
diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py