12
12
_new_Index ,
13
13
ensure_index ,
14
14
ensure_index_from_sequences ,
15
- CannotSortError ,
16
- InvalidIndexError
15
+ InvalidIndexError ,
17
16
)
18
17
from pandas .core .indexes .category import CategoricalIndex # noqa
19
18
from pandas .core .indexes .multi import MultiIndex # noqa
38
37
""" )
39
38
40
39
40
class _CannotSortError(Exception):
    """Private signal that an index could not be sorted.

    Raised by ``_sort_index`` when ``Index.sort_values`` fails with a
    ``TypeError``.
    """
42
+
43
+
44
class _CannotSortDuplicatesError(Exception):
    """Private signal that sorting was requested on duplicated input.

    Raised by ``_merge_index_list`` when ``sort=True`` and at least one of
    the input indexes has duplicate values.
    """
46
+
47
+
48
class _DuplicatesError(Exception):
    """Private signal that an input index has disallowed duplicates.

    Raised by ``_merge_index_list`` when duplicate verification is enabled
    and an input index contains duplicate values that are not permitted.
    """
50
+
51
+
41
52
# TODO: there are many places that rely on these private methods existing in
42
53
# pandas.core.index
43
54
__all__ = ['Index' , 'MultiIndex' , 'NumericIndex' , 'Float64Index' , 'Int64Index' ,
@@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
181
192
----------
182
193
index_list: list of Index objects
183
194
verify_inputs: boolean, default True
184
- Verify if the input indexes contain overlapping values.
195
+ Verify if the input indexes contain duplicate values. Ignored when all
196
+ input indexes share the same identity (a is b).
185
197
sort: boolean, default False
186
- Order result index. If False, values will come in the order they
198
+ Order resulting index. If False, values will come in the order they
187
199
appear.
188
200
189
201
Raises
190
202
------
191
- CannotSortError
192
- When sort=True and the result index is not sortable.
193
- InvalidIndexError
194
- When verify_inputs=True and 1+ of the indexes contain duplicates.
203
+ InvalidIndexError:
204
+ When there are duplicates in at least one of the indexes (col)
205
+ and they are not allowed.
206
+ TypeError:
207
+ When sort=True and the resulting index (col) could not be sorted.
195
208
"""
196
209
orig_columns = [df .columns for df in frame_list ]
197
- merged_columns = _merge_index_list (orig_columns , verify_inputs , sort )
210
+
211
+ kwargs = {
212
+ 'verify_dups' : verify_inputs ,
213
+ 'allow_matching_dups' : verify_inputs ,
214
+ 'sort' : sort ,
215
+ }
216
+
217
+ try :
218
+ merged_columns = _merge_index_list (orig_columns , ** kwargs )
219
+ except _DuplicatesError :
220
+ raise InvalidIndexError ("Indexes with duplicates are only allowed"
221
+ " when they are the same (a is b)." )
222
+ except _CannotSortDuplicatesError :
223
+ raise InvalidIndexError ("When sort=True, indexes with duplicate"
224
+ " values are not allowed." )
225
+ except _CannotSortError :
226
+ raise TypeError ("The resulting columns could not be sorted."
227
+ " You can try setting sort=False or use"
228
+ " compatible index types." )
198
229
199
230
# Because _merge_index_list may infer the index dtype based on values,
200
231
# we have to provide a workaround to conserve the original dtype.
@@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
217
248
return [_reindex (df , merged_columns , axis = 1 ) for df in frame_list ]
218
249
219
250
220
def _merge_index_list(index_list,
                      verify_dups=True,
                      allow_matching_dups=False,
                      sort=False):
    """Merge a list of indexes into one big index.

    Parameters
    ----------
    index_list: list of Index objects
    verify_dups: boolean, default True
        Verify if the input indexes contain duplicate values.
    allow_matching_dups: boolean, default False
        Only relevant when verify_dups=True. Allow duplicate values when all
        indexes have the same identity.
    sort: boolean, default False
        Order result index. If False, values will come in the order they
        appear.

    Returns
    -------
    Index
        The merged index. An empty Index is returned when there are no
        input indexes.

    Raises
    ------
    _CannotSortError
        When sort=True and the result index is not sortable.
    _CannotSortDuplicatesError
        When sort=True and at least one of the inputs contains duplicate
        values.
    _DuplicatesError
        When verify_dups=True and at least one of the input indexes contains
        duplicate values. This error is not raised if
        allow_matching_dups=True and all the indexes have a common identity.
    """
    # Work on the distinct input objects only (identity based, ``a is b``),
    # so the same Index passed several times is considered once.
    distinct = com.get_distinct_objs(index_list)

    # Duplicates only matter when sorting or when verification is requested.
    if (sort or verify_dups) and any(ix.has_duplicates for ix in distinct):
        if sort:
            raise _CannotSortDuplicatesError("Cannot sort an index that"
                                             " contains duplicate values.")
        # From here on verify_dups is necessarily True (sort is False).
        if not allow_matching_dups:
            raise _DuplicatesError("Index has duplicate values.")
        if len(distinct) >= 2:
            # Duplicates are tolerated only when every input is the very
            # same object, i.e. exactly one distinct index remains.
            raise _DuplicatesError("Index has duplicate values and does"
                                   " not match other indexes.")

    # Nothing to merge: Index requires data, so build an explicitly empty one.
    if len(distinct) == 0:
        return Index([])

    # Fold the remaining indexes pairwise into a single result. With one
    # distinct index the loop body never runs and that index is the result.
    result = distinct[0]
    for idx in distinct[1:]:
        result = _merge_indexes(result, idx)

    # Apply sorting uniformly, including the single-index case, so that
    # sort=True is honoured regardless of how many inputs were given.
    return _sort_index(result) if sort else result
248
310
249
311
@@ -278,7 +340,7 @@ def _sort_index(index):
278
340
try :
279
341
return index .sort_values ()
280
342
except TypeError :
281
- raise CannotSortError
343
+ raise _CannotSortError
282
344
283
345
284
346
def _reindex (df , new_index , axis = 0 ):
0 commit comments