Skip to content
This repository was archived by the owner on Dec 22, 2019. It is now read-only.

Commit 5dfe32e

Browse files
author
araraonline
authored
Merge pull request #7 from araraonline/duplicate
Handle columns index duplicates
2 parents cb28274 + eee2dae commit 5dfe32e

File tree

4 files changed

+212
-189
lines changed

4 files changed

+212
-189
lines changed

pandas/core/frame.py

Lines changed: 12 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -6420,31 +6420,15 @@ def _append_list_of_frames(self, other, *args, **kwargs):
64206420
_obj_type = kwargs['_obj_type']
64216421
_item_type = kwargs.get('_item_type')
64226422

6423-
from pandas.core.indexes.api import (
6424-
CannotSortError,
6425-
_normalize_dataframes,
6426-
)
6423+
from pandas.core.indexes.api import _normalize_dataframes
64276424
from pandas.core.reshape.concat import concat
64286425

6429-
# The default value of sort in version 0.23.0 is None.
6430-
# The behavior when this was the value is very
6431-
# varied and changes according to input type, columns index
6432-
# type, whether a reindex is necessary or not, etc.
6433-
#
6434-
# The code below is a try to reproduce the old behavior,
6435-
# but note that this is deprecated.
6436-
#
6437-
# TODO: handle sort=None here
6438-
6439-
# The behavior of concat is a bit problematic as it is. To get around
6440-
# this, we prepare the DataFrames before feeding them into concat.
6426+
# TODO: sorting behavior when sort=None
6427+
6428+
# The behavior of concat is a bit problematic as it is. To get around
6429+
# this, we prepare the DataFrames before feeding them into concat.
64416430
to_concat = [self] + other
6442-
try:
6443-
to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
6444-
except CannotSortError:
6445-
raise TypeError("The resulting columns could not be sorted."
6446-
" You can try setting sort=False or use"
6447-
" compatible index types.")
6431+
to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
64486432
result = concat(to_concat_norm, ignore_index=ignore_index,
64496433
verify_integrity=verify_integrity, sort=sort)
64506434

@@ -6454,45 +6438,13 @@ def _append_list_of_frames(self, other, *args, **kwargs):
64546438
if not ignore_index:
64556439
result.index.name = self.index.name
64566440

6457-
# the conditionals below will be refactored or removed
6458-
6459-
if sort is None:
6460-
# The sorting behaviour for None was weird.
6461-
# It is getting deprecated.
6462-
#
6463-
# By now, fix tests by only sorting when the
6464-
# original 'other' was a series or a dict.
6465-
if _obj_type in (dict, Series):
6466-
sort = False
6467-
elif _item_type in (dict, Series):
6468-
# A list of dicts/Series had a different behaviour
6469-
# when sorting is None.
6470-
#
6471-
# We do not sort if the 'other' columns are all
6472-
# contained in self.columns. Otherwise we do
6473-
# sort.
6474-
#
6475-
# TODO: as per documentation, this seems like the original
6476-
# behaviour intended for append. Should I implement this
6477-
# for any inputs that come?
6478-
self_idx = self.columns
6479-
other_idx = other[0].columns
6480-
idx_diff = other_idx.difference(self_idx)
6481-
sort = len(idx_diff) > 0
6482-
else:
6483-
sort = True
6484-
6441+
# Reindexing the columns created an artificial float64 where it
6442+
# was not needed. We can convert the columns back to the expected
6443+
# type.
64856444
if result.shape[0] == 1:
6486-
from pandas.core.dtypes.cast import find_common_type
6487-
6488-
# Reindexing the columns created an artificial float64 where it
6489-
# was not needed. We can convert the columns back to the expected
6490-
# type.
6491-
6492-
for col in result:
6493-
types = [df[col].dtype for df in to_concat if col in df]
6494-
common_type = find_common_type(types)
6495-
result[col] = result[col].astype(common_type)
6445+
base_frame = next(df for df in to_concat_norm if df.shape[0] == 1)
6446+
dtypes = base_frame.dtypes.to_dict()
6447+
result = result.astype(dtypes) # won't work well with dup cols
64966448

64976449
return result
64986450

pandas/core/indexes/api.py

Lines changed: 84 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@
1212
_new_Index,
1313
ensure_index,
1414
ensure_index_from_sequences,
15-
CannotSortError,
16-
InvalidIndexError
15+
InvalidIndexError,
1716
)
1817
from pandas.core.indexes.category import CategoricalIndex # noqa
1918
from pandas.core.indexes.multi import MultiIndex # noqa
@@ -38,6 +37,18 @@
3837
""")
3938

4039

40+
class _CannotSortError(Exception):
41+
pass
42+
43+
44+
class _CannotSortDuplicatesError(Exception):
45+
pass
46+
47+
48+
class _DuplicatesError(Exception):
49+
pass
50+
51+
4152
# TODO: there are many places that rely on these private methods existing in
4253
# pandas.core.index
4354
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
@@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
181192
----------
182193
frame_list: list of DataFrame objects
183194
verify_inputs: boolean, default True
184-
Verify if the input indexes contain overlapping values.
195+
Verify if the input indexes contain duplicate values. Ignored when all
196+
input indexes share the same identity (a is b).
185197
sort: boolean, default False
186-
Order result index. If False, values will come in the order they
198+
Order resulting index. If False, values will come in the order they
187199
appear.
188200
189201
Raises
190202
------
191-
CannotSortError
192-
When sort=True and the result index is not sortable.
193-
InvalidIndexError
194-
When verify_inputs=True and 1+ of the indexes contain duplicates.
203+
InvalidIndexError:
204+
When there are duplicates in at least one of the indexes (col)
205+
and they are not allowed.
206+
TypeError:
207+
When sort=True and the resulting index (col) could not be sorted.
195208
"""
196209
orig_columns = [df.columns for df in frame_list]
197-
merged_columns = _merge_index_list(orig_columns, verify_inputs, sort)
210+
211+
kwargs = {
212+
'verify_dups': verify_inputs,
213+
'allow_matching_dups': verify_inputs,
214+
'sort': sort,
215+
}
216+
217+
try:
218+
merged_columns = _merge_index_list(orig_columns, **kwargs)
219+
except _DuplicatesError:
220+
raise InvalidIndexError("Indexes with duplicates are only allowed"
221+
" when they are the same (a is b).")
222+
except _CannotSortDuplicatesError:
223+
raise InvalidIndexError("When sort=True, indexes with duplicate"
224+
" values are not allowed.")
225+
except _CannotSortError:
226+
raise TypeError("The resulting columns could not be sorted."
227+
" You can try setting sort=False or use"
228+
" compatible index types.")
198229

199230
# Because _merge_index_list may infer the index dtype based on values,
200231
# we have to provide a workaround to conserve the original dtype.
@@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
217248
return [_reindex(df, merged_columns, axis=1) for df in frame_list]
218249

219250

220-
def _merge_index_list(index_list, verify_inputs=True, sort=False):
251+
def _merge_index_list(index_list,
252+
verify_dups=True,
253+
allow_matching_dups=False,
254+
sort=False):
221255
"""Merge a list of indexes into one big index
222256
223257
Parameters
224258
----------
225259
index_list: list of Index objects
226-
verify_inputs: boolean, default True
227-
Verify if the input indexes contain overlapping values.
260+
verify_dups: boolean, default True
261+
Verify if the input indexes contain duplicate values.
262+
allow_matching_dups: boolean, default False
263+
Only relevant when verify_dups=True. Allow duplicate values when all
264+
indexes have the same identity.
228265
sort: boolean, default False
229266
Order result index. If False, values will come in the order they
230267
appear.
231268
232269
Raises
233270
------
234-
CannotSortError
271+
_CannotSortError
235272
When sort=True and the result index is not sortable.
236-
InvalidIndexError
237-
When verify_inputs=True and 1+ of the indexes contain duplicates.
273+
_CannotSortDuplicatesError
274+
When sort=True and at least one of the inputs contains duplicate
275+
values.
276+
_DuplicatesError
277+
When verify_dups=True and at least one of the input indexes contains
278+
duplicate values. This error is not raised if
279+
allow_matching_dups=True and all the indexes have a common identity.
238280
"""
239-
if verify_inputs:
240-
if any([ix.has_duplicates for ix in index_list]):
241-
raise InvalidIndexError("Input index has duplicate values")
242-
243-
result = index_list[0]
244-
for idx in index_list[1:]:
281+
# unique index list (a is b)
282+
uindex_list = com.get_distinct_objs(index_list)
283+
284+
# verify duplicates
285+
if sort or verify_dups:
286+
has_dups = any(ix.has_duplicates for ix in uindex_list)
287+
if has_dups:
288+
if sort:
289+
raise _CannotSortDuplicatesError("Cannot sort an index that"
290+
" contains duplicate values.")
291+
elif verify_dups and not allow_matching_dups:
292+
raise _DuplicatesError("Index has duplicate values.")
293+
elif verify_dups and allow_matching_dups and len(uindex_list) >= 2:
294+
raise _DuplicatesError("Index has duplicate values and does"
295+
" not match other indexes.")
296+
297+
# edge results
298+
if len(uindex_list) == 0:
299+
return pd.Index()
300+
elif len(uindex_list) == 1:
301+
return uindex_list[0]
302+
303+
# reduce to one result
304+
result = uindex_list[0]
305+
for idx in uindex_list[1:]:
245306
result = _merge_indexes(result, idx)
246307

308+
# sort
247309
return result if not sort else _sort_index(result)
248310

249311

@@ -278,7 +340,7 @@ def _sort_index(index):
278340
try:
279341
return index.sort_values()
280342
except TypeError:
281-
raise CannotSortError
343+
raise _CannotSortError
282344

283345

284346
def _reindex(df, new_index, axis=0):

pandas/core/indexes/base.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,10 +158,6 @@ def index_arithmetic_method(self, other):
158158
return set_function_name(index_arithmetic_method, name, cls)
159159

160160

161-
class CannotSortError(Exception):
162-
pass
163-
164-
165161
class InvalidIndexError(Exception):
166162
pass
167163

0 commit comments

Comments
 (0)