Skip to content
This repository was archived by the owner on Dec 22, 2019. It is now read-only.

Commit eee2dae

Browse files
author
araraonline
committed
Modify behavior of append on duplicates
Previously, when there were duplicates in the columns index, sorting was allowed, and duplicates were allowed if the indexes had the same values (as found by idx.tolist()). However, pandas does not allow sorting the index when there are duplicate values (DataFrame.reindex fails), and searching for matching values is counter-productive and prone to fail, depending on the types of the indexes involved. The behavior was therefore changed as follows:
- When sort=True and there are duplicates in at least one index, an error is raised and append stops.
- DataFrames with duplicate indexes are only joined when the indexes share the same identity (that is, they are the same object, as checked with `idx1 is idx2`).
Some other improvements to the code have also been made, and I believe it is better overall.
1 parent 87cc878 commit eee2dae

File tree

4 files changed

+204
-81
lines changed

4 files changed

+204
-81
lines changed

pandas/core/frame.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6420,31 +6420,15 @@ def _append_list_of_frames(self, other, *args, **kwargs):
64206420
_obj_type = kwargs['_obj_type']
64216421
_item_type = kwargs.get('_item_type')
64226422

6423-
from pandas.core.indexes.api import (
6424-
CannotSortError,
6425-
_normalize_dataframes,
6426-
)
6423+
from pandas.core.indexes.api import _normalize_dataframes
64276424
from pandas.core.reshape.concat import concat
64286425

6429-
# The default value of sort in version 0.23.0 is None.
6430-
# The behavior when this was the value is very
6431-
# varied and changes according to input type, columns index
6432-
# type, whether a reindex is necessary or not, etc.
6433-
#
6434-
# The code below is a try to reproduce the old behavior,
6435-
# but note that this is deprecated.
6436-
#
6437-
# TODO: handle sort=None here
6438-
6439-
# The behavior of concat is a bit problematic as it is. To get around
6440-
# this, we prepare the DataFrames before feeding them into concat.
6426+
# TODO: sorting behavior when sort=None
6427+
6428+
# The behavior of concat is a bit problematic as it is. To get around
6429+
# this, we prepare the DataFrames before feeding them into concat.
64416430
to_concat = [self] + other
6442-
try:
6443-
to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
6444-
except CannotSortError:
6445-
raise TypeError("The resulting columns could not be sorted."
6446-
" You can try setting sort=False or use"
6447-
" compatible index types.")
6431+
to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
64486432
result = concat(to_concat_norm, ignore_index=ignore_index,
64496433
verify_integrity=verify_integrity, sort=sort)
64506434

pandas/core/indexes/api.py

Lines changed: 84 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@
1212
_new_Index,
1313
ensure_index,
1414
ensure_index_from_sequences,
15-
CannotSortError,
16-
InvalidIndexError
15+
InvalidIndexError,
1716
)
1817
from pandas.core.indexes.category import CategoricalIndex # noqa
1918
from pandas.core.indexes.multi import MultiIndex # noqa
@@ -38,6 +37,18 @@
3837
""")
3938

4039

40+
class _CannotSortError(Exception):
41+
pass
42+
43+
44+
class _CannotSortDuplicatesError(Exception):
45+
pass
46+
47+
48+
class _DuplicatesError(Exception):
49+
pass
50+
51+
4152
# TODO: there are many places that rely on these private methods existing in
4253
# pandas.core.index
4354
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
@@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
181192
----------
182193
index_list: list of Index objects
183194
verify_inputs: boolean, default True
184-
Verify if the input indexes contain overlapping values.
195+
Verify if the input indexes contain duplicate values. Ignored when all
196+
input indexes share the same identity (a is b).
185197
sort: boolean, default False
186-
Order result index. If False, values will come in the order they
198+
Order resulting index. If False, values will come in the order they
187199
appear.
188200
189201
Raises
190202
------
191-
CannotSortError
192-
When sort=True and the result index is not sortable.
193-
InvalidIndexError
194-
When verify_inputs=True and 1+ of the indexes contain duplicates.
203+
InvalidIndexError:
204+
When there are duplicates in at least one of the indexes (col)
205+
and they are not allowed.
206+
TypeError:
207+
When sort=True and the resulting index (col) could not be sorted.
195208
"""
196209
orig_columns = [df.columns for df in frame_list]
197-
merged_columns = _merge_index_list(orig_columns, verify_inputs, sort)
210+
211+
kwargs = {
212+
'verify_dups': verify_inputs,
213+
'allow_matching_dups': verify_inputs,
214+
'sort': sort,
215+
}
216+
217+
try:
218+
merged_columns = _merge_index_list(orig_columns, **kwargs)
219+
except _DuplicatesError:
220+
raise InvalidIndexError("Indexes with duplicates are only allowed"
221+
" when they are the same (a is b).")
222+
except _CannotSortDuplicatesError:
223+
raise InvalidIndexError("When sort=True, indexes with duplicate"
224+
" values are not allowed.")
225+
except _CannotSortError:
226+
raise TypeError("The resulting columns could not be sorted."
227+
" You can try setting sort=False or use"
228+
" compatible index types.")
198229

199230
# Because _merge_index_list may infer the index dtype based on values,
200231
# we have to provide a workaround to conserve the original dtype.
@@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
217248
return [_reindex(df, merged_columns, axis=1) for df in frame_list]
218249

219250

220-
def _merge_index_list(index_list, verify_inputs=True, sort=False):
251+
def _merge_index_list(index_list,
252+
verify_dups=True,
253+
allow_matching_dups=False,
254+
sort=False):
221255
"""Merge a list of indexes into one big index
222256
223257
Parameters
224258
----------
225259
index_list: list of Index objects
226-
verify_inputs: boolean, default True
227-
Verify if the input indexes contain overlapping values.
260+
verify_dups: boolean, default True
261+
Verify if the input indexes contain duplicate values.
262+
allow_matching_dups: boolean, default False
263+
Only relevant when verify_dups=True. Allow duplicate values when all
264+
indexes have the same identity.
228265
sort: boolean, default False
229266
Order result index. If False, values will come in the order they
230267
appear.
231268
232269
Raises
233270
------
234-
CannotSortError
271+
_CannotSortError
235272
When sort=True and the result index is not sortable.
236-
InvalidIndexError
237-
When verify_inputs=True and 1+ of the indexes contain duplicates.
273+
_CannotSortDuplicatesError
274+
When sort=True and at least one of the inputs contain duplicate
275+
values.
276+
_DuplicatesError
277+
When verify_dups=True and at least one of the input indexes contain
278+
duplicate values. This error is not raised if
279+
allow_matching_dups=True and all the indexes have a common identity.
238280
"""
239-
if verify_inputs:
240-
if any([ix.has_duplicates for ix in index_list]):
241-
raise InvalidIndexError("Input index has duplicate values")
242-
243-
result = index_list[0]
244-
for idx in index_list[1:]:
281+
# unique index list (a is b)
282+
uindex_list = com.get_distinct_objs(index_list)
283+
284+
# verify duplicates
285+
if sort or verify_dups:
286+
has_dups = any(ix.has_duplicates for ix in uindex_list)
287+
if has_dups:
288+
if sort:
289+
raise _CannotSortDuplicatesError("Cannot sort an index that"
290+
" contains duplicate values.")
291+
elif verify_dups and not allow_matching_dups:
292+
raise _DuplicatesError("Index has duplicate values.")
293+
elif verify_dups and allow_matching_dups and len(uindex_list) >= 2:
294+
raise _DuplicatesError("Index has duplicate values and does"
295+
" not match other indexes.")
296+
297+
# edge results
298+
if len(uindex_list) == 0:
299+
return pd.Index()
300+
elif len(uindex_list) == 1:
301+
return uindex_list[0]
302+
303+
# reduce to one result
304+
result = uindex_list[0]
305+
for idx in uindex_list[1:]:
245306
result = _merge_indexes(result, idx)
246307

308+
# sort
247309
return result if not sort else _sort_index(result)
248310

249311

@@ -278,7 +340,7 @@ def _sort_index(index):
278340
try:
279341
return index.sort_values()
280342
except TypeError:
281-
raise CannotSortError
343+
raise _CannotSortError
282344

283345

284346
def _reindex(df, new_index, axis=0):

pandas/core/indexes/base.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,10 +158,6 @@ def index_arithmetic_method(self, other):
158158
return set_function_name(index_arithmetic_method, name, cls)
159159

160160

161-
class CannotSortError(Exception):
162-
pass
163-
164-
165161
class InvalidIndexError(Exception):
166162
pass
167163

pandas/tests/reshape/test_append.py

Lines changed: 114 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,36 @@
3939
]
4040

4141

42+
indexes_with_dups = [
43+
# base
44+
pd.Index(['A', 'B', 'B']),
45+
pd.Index(['B', 'B', 'A']),
46+
pd.Index(['A', 'B', 'B'], name='foo'),
47+
pd.Index(['B', 'B', 'A'], name='bar'),
48+
49+
# numeric
50+
pd.Index([9, 10, 10], dtype=object),
51+
pd.Int64Index([3, 4, 4]),
52+
pd.UInt64Index([6, 7, 7]),
53+
pd.Float64Index([3.5, 4.5, 4.5]),
54+
55+
# datetime
56+
pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-10']),
57+
pd.to_timedelta(['1 day', '2 days', '2 days']),
58+
pd.PeriodIndex([2000, 2001, 2001], freq='A'),
59+
60+
# interval
61+
pd.IntervalIndex.from_arrays([0, 1, 1], [1, 2, 2]),
62+
63+
# categorical
64+
pd.CategoricalIndex('A B B'.split()),
65+
pd.CategoricalIndex('D E E'.split(), ordered=True),
66+
67+
# multi-index
68+
pd.MultiIndex.from_arrays(['A B B'.split(), 'D E E'.split()]),
69+
]
70+
71+
4272
index_sort_groups = [
4373
# When indexes from the same group are joined, the result is sortable.
4474
# When indexes from different groups are joined, the result is not
@@ -403,39 +433,90 @@ def test_preserve_index_values_with_sort(self, index1, index2):
403433
for value in index2:
404434
assert value in result.columns
405435

406-
def test_raise_on_duplicates(self, sort):
407-
# Append should not allow DataFrames with repeated
408-
# column names (or series with repeated row names).
409-
410-
# dupe on base
411-
df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
412-
df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
413-
with pytest.raises(InvalidIndexError):
414-
df1.append([], sort=sort)
415-
with pytest.raises(InvalidIndexError):
416-
df1.append([df2], sort=sort)
417-
with pytest.raises(InvalidIndexError):
418-
df1.append([df2, df2], sort=sort)
419-
420-
# dupe on other
421-
df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
422-
df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
423-
with pytest.raises(InvalidIndexError):
424-
df1.append([df2], sort=sort)
425-
with pytest.raises(InvalidIndexError):
426-
df1.append([df2, df2], sort=sort)
427-
428-
# dupe on both
429-
# (we could avoid raising errors here, but, to keep the api
430-
# consistent, we don't)
431-
df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
432-
df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
433-
with pytest.raises(InvalidIndexError):
434-
df1.append([], sort=sort)
435-
with pytest.raises(InvalidIndexError):
436-
df1.append([df2], sort=sort)
437-
with pytest.raises(InvalidIndexError):
438-
df1.append([df2, df2], sort=sort)
436+
@pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
437+
def test_good_duplicates_without_sort(self, col_index):
438+
# When all indexes have the same identity (a is b), duplicates should
439+
# be allowed and append works.
440+
441+
df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
442+
df2 = pd.DataFrame([[4, 5, 6]], columns=col_index)
443+
444+
# df1.append([])
445+
result = df1.append([], sort=False)
446+
expected = df1.copy()
447+
assert_frame_equal(result, expected)
448+
449+
# df1.append([df2])
450+
result = df1.append([df2], ignore_index=True, sort=False)
451+
expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
452+
expected.columns = col_index
453+
assert_frame_equal(result, expected)
454+
455+
# df1.append([df2, df2])
456+
result = df1.append([df2, df2], ignore_index=True, sort=False)
457+
expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [4, 5, 6]])
458+
expected.columns = col_index
459+
assert_frame_equal(result, expected)
460+
461+
# df2.append([])
462+
result = df2.append([], sort=False)
463+
expected = df2.copy()
464+
assert_frame_equal(result, expected)
465+
466+
# df2.append([df1])
467+
result = df2.append([df1], ignore_index=True, sort=False)
468+
expected = pd.DataFrame([[4, 5, 6], [1, 2, 3]])
469+
expected.columns = col_index
470+
assert_frame_equal(result, expected)
471+
472+
# df2.append([df1, df1])
473+
result = df2.append([df1, df1], ignore_index=True, sort=False)
474+
expected = pd.DataFrame([[4, 5, 6], [1, 2, 3], [1, 2, 3]])
475+
expected.columns = col_index
476+
assert_frame_equal(result, expected)
477+
478+
@pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
479+
def test_bad_duplicates_without_sort(self, col_index):
480+
# When the indexes do not share a common identity, duplicates are not
481+
# allowed and append raises.
482+
483+
df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
484+
df2 = pd.DataFrame([[4, 5, 6]], columns=col_index)
485+
df3 = pd.DataFrame([[7, 8, 9]], columns=col_index.copy()) # different
486+
ctx = pytest.raises(InvalidIndexError,
487+
match=r'Indexes with duplicates.*a is b.*')
488+
with ctx:
489+
result = df1.append([df3], sort=False)
490+
with ctx:
491+
result = df1.append([df2, df3], sort=False)
492+
with ctx:
493+
result = df1.append([df3, df2], sort=False)
494+
with ctx:
495+
result = df1.append([df3, df3], sort=False)
496+
497+
@pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
498+
def test_duplicates_with_sort(self, col_index):
499+
# When sort=True, indexes with duplicate values are not allowed.
500+
501+
df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
502+
df2 = pd.DataFrame([[4, 5, 6]], columns=col_index.copy())
503+
ctx = pytest.raises(InvalidIndexError,
504+
match=r'When sort=True, indexes with dupl.*')
505+
506+
with ctx:
507+
result = df1.append([], sort=True)
508+
with ctx:
509+
result = df1.append([df1], sort=True)
510+
with ctx:
511+
result = df1.append([df2], sort=True)
512+
with ctx:
513+
result = df1.append([df1, df1], sort=True)
514+
with ctx:
515+
result = df1.append([df1, df2], sort=True)
516+
with ctx:
517+
result = df1.append([df2, df1], sort=True)
518+
with ctx:
519+
result = df1.append([df2, df2], sort=True)
439520

440521
def test_nosort_basic(self):
441522
# When sort=False, the resulting columns come

0 commit comments

Comments
 (0)