diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 30bc332f8a04b..961eb237f7d17 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -154,6 +154,33 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. +.. _whatsnew_0250.api_breaking.incompatible_index_unions: + +Incompatible Index Type Unions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When performing :func:`Index.union` operations between objects of incompatible dtypes, +the result will be a base :class:`Index` of dtype ``object``. This behavior holds true for +unions between :class:`Index` objects that previously would have been prohibited. The dtype +of empty :class:`Index` objects will now be evaluated before performing union operations +rather than simply returning the other :class:`Index` object. :func:`Index.union` can now be +considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). + +*Previous Behavior*: + + In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) + ... + ValueError: can only call with other PeriodIndex-ed objects + + In [2]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) + Out[2]: Int64Index([1, 2, 3], dtype='int64') + +*New Behavior*: + +.. 
ipython:: python + + pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) + pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) ``DataFrame`` groupby ffill/bfill no longer return group labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dd3717813ce3f..eff7ff2c9f347 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -20,11 +20,10 @@ ensure_categorical, ensure_int64, ensure_object, ensure_platform_int, is_bool, is_bool_dtype, is_categorical, is_categorical_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_dtype_union_equal, is_extension_array_dtype, is_float, is_float_dtype, - is_hashable, is_integer, is_integer_dtype, is_interval_dtype, is_iterator, - is_list_like, is_object_dtype, is_period_dtype, is_scalar, - is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, - pandas_dtype) + is_extension_array_dtype, is_float, is_float_dtype, is_hashable, + is_integer, is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, + is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, + is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass, @@ -2262,6 +2261,47 @@ def _get_reconciled_name_object(self, other): return self._shallow_copy(name=name) return self + def _union_incompatible_dtypes(self, other, sort): + """ + Casts this and other index to object dtype to allow the formation + of a union between incompatible types. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. 
+ + Returns + ------- + Index + """ + this = self.astype(object, copy=False) + # cast to Index for when `other` is list-like + other = Index(other).astype(object, copy=False) + return Index.union(this, other, sort=sort).astype(object, copy=False) + + def _is_compatible_with_other(self, other): + """ + Check whether this and the other dtype are compatible with each other. + Meaning a union can be formed between them without needing to be cast + to dtype object. + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + bool + """ + return (type(self) is type(other) + and is_dtype_equal(self.dtype, other.dtype)) + def _validate_sort_keyword(self, sort): if sort not in [None, False]: raise ValueError("The 'sort' keyword only takes the values of " @@ -2271,6 +2311,11 @@ def union(self, other, sort=None): """ Form the union of two Index objects. + If the Index objects are incompatible, both Index objects will be + cast to dtype('object') first. + + .. versionchanged:: 0.25.0 + Parameters ---------- other : Index or array-like @@ -2300,30 +2345,54 @@ def union(self, other, sort=None): Examples -------- + Union matching dtypes + >>> idx1 = pd.Index([1, 2, 3, 4]) >>> idx2 = pd.Index([3, 4, 5, 6]) >>> idx1.union(idx2) Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + + Union mismatched dtypes + + >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx2 = pd.Index([1, 2, 3, 4]) + >>> idx1.union(idx2) + Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other = ensure_index(other) - if len(other) == 0 or self.equals(other): + if not self._is_compatible_with_other(other): + return self._union_incompatible_dtypes(other, sort=sort) + + return self._union(other, sort=sort) + + def _union(self, other, sort): + """ + Specific union logic should go here. In subclasses, union behavior + should be overwritten here rather than in `self.union`. 
+ + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + Returns + ------- + Index + """ + + if not len(other) or self.equals(other): return self._get_reconciled_name_object(other) - if len(self) == 0: + if not len(self): return other._get_reconciled_name_object(self) - # TODO: is_dtype_union_equal is a hack around - # 1. buggy set ops with duplicates (GH #13432) - # 2. CategoricalIndex lacking setops (GH #10186) - # Once those are fixed, this workaround can be removed - if not is_dtype_union_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.union(other, sort=sort) - # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self) or is_datetime64tz_dtype(self): lvals = self._ndarray_values @@ -2370,6 +2439,7 @@ def union(self, other, sort=None): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) + # TODO: standardize return type of non-union setops type(self vs other) def intersection(self, other, sort=False): """ Form the intersection of two Index objects. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9c735a5598f4a..7fd537fb9989a 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -451,35 +451,9 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def union(self, other, sort=None): - """ - Specialized union for DatetimeIndex objects. If combine - overlapping ranges with the same DateOffset, will be much - faster than Index.union - - Parameters - ---------- - other : DatetimeIndex or array-like - sort : bool or None, default None - Whether to sort the resulting Index. 
- - * None : Sort the result, except when - - 1. `self` and `other` are equal. - 2. `self` or `other` has length 0. - 3. Some values in `self` or `other` cannot be compared. - A RuntimeWarning is issued in this case. - - * False : do not sort the result - - .. versionadded:: 0.25.0 - - Returns - ------- - y : Index or DatetimeIndex - """ - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) if len(other) == 0 or self.equals(other) or len(self) == 0: return super().union(other, sort=sort) @@ -495,7 +469,7 @@ def union(self, other, sort=None): if this._can_fast_union(other): return this._fast_union(other, sort=sort) else: - result = Index.union(this, other, sort=sort) + result = Index._union(this, other, sort=sort) if isinstance(result, DatetimeIndex): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a3dbf2e03957b..87216dcc7b957 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -964,19 +964,6 @@ def insert(self, loc, item): new_right = self.right.insert(loc, right_insert) return self._shallow_copy(new_left, new_right) - def _as_like_interval_index(self, other): - self._assert_can_do_setop(other) - other = ensure_index(other) - if not isinstance(other, IntervalIndex): - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type {}').format(other.__class__.__name__) - raise TypeError(msg) - elif self.closed != other.closed: - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') - raise ValueError(msg) - return other - def _concat_same_dtype(self, to_concat, name): """ assert that we all have the same .closed @@ -1092,7 +1079,17 @@ def overlaps(self, other): def _setop(op_name, 
sort=None): def func(self, other, sort=sort): - other = self._as_like_interval_index(other) + self._assert_can_do_setop(other) + other = ensure_index(other) + if not isinstance(other, IntervalIndex): + result = getattr(self.astype(object), op_name)(other) + if op_name in ('difference',): + result = result.astype(self.dtype) + return result + elif self.closed != other.closed: + msg = ('can only do set operations between two IntervalIndex ' + 'objects that are closed on the same side') + raise ValueError(msg) # GH 19016: ensure set op will not return a prohibited dtype subtypes = [self.dtype.subtype, other.dtype.subtype] @@ -1114,6 +1111,7 @@ def func(self, other, sort=sort): return type(self).from_tuples(result, closed=self.closed, name=result_name) + return func @property diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index a11f34cbdcceb..b6c8ba588f9d6 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -9,6 +9,7 @@ is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex from pandas.core.dtypes.missing import isna from pandas.core import algorithms @@ -221,6 +222,13 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + def _is_compatible_with_other(self, other): + return ( + super()._is_compatible_with_other(other) + or all(isinstance(type(obj), (ABCInt64Index, ABCRangeIndex)) + for obj in [self, other]) + ) + Int64Index._add_numeric_methods() Int64Index._add_logical_methods() diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ed08de54ad6f2..044951ceda502 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -791,6 +791,11 @@ def join(self, other, how='left', level=None, 
return_indexers=False, """ self._assert_can_do_setop(other) + if not isinstance(other, PeriodIndex): + return self.astype(object).join(other, how=how, level=level, + return_indexers=return_indexers, + sort=sort) + result = Int64Index.join(self, other, how=how, level=level, return_indexers=return_indexers, sort=sort) @@ -807,10 +812,9 @@ def intersection(self, other, sort=False): def _assert_can_do_setop(self, other): super()._assert_can_do_setop(other) - if not isinstance(other, PeriodIndex): - raise ValueError('can only call with other PeriodIndex-ed objects') - - if self.freq != other.freq: + # *Can't* use PeriodIndexes of different freqs + # *Can* use PeriodIndex/DatetimeIndex + if isinstance(other, PeriodIndex) and self.freq != other.freq: msg = DIFFERENT_FREQ.format(cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 160e6284d3c59..ea14a4c789cd3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -470,7 +470,7 @@ def _extended_gcd(self, a, b): old_t, t = t, old_t - quotient * t return old_r, old_s, old_t - def union(self, other, sort=None): + def _union(self, other, sort): """ Form the union of two Index objects and sorts if possible @@ -490,9 +490,8 @@ def union(self, other, sort=None): ------- union : Index """ - self._assert_can_do_setop(other) - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other, sort=sort) + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) if isinstance(other, RangeIndex) and sort is None: start_s, step_s = self._start, self._step @@ -530,8 +529,7 @@ def union(self, other, sort=None): (start_s + step_o >= start_o) and (end_s - step_o <= end_o)): return RangeIndex(start_r, end_r + step_o, step_o) - - return self._int64index.union(other, sort=sort) + return self._int64index._union(other, sort=sort) 
@Appender(_index_shared_docs['join']) def join(self, other, how='left', level=None, return_indexers=False, diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5e62c2ef881e9..6ae17e62b49c6 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -329,24 +329,9 @@ def astype(self, dtype, copy=True): return Index(result.astype('i8'), name=self.name) return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) - def union(self, other): - """ - Specialized union for TimedeltaIndex objects. If combine - overlapping ranges with the same DateOffset, will be much - faster than Index.union - - Parameters - ---------- - other : TimedeltaIndex or array-like - - Returns - ------- - y : Index or TimedeltaIndex - """ - self._assert_can_do_setop(other) - + def _union(self, other, sort): if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other) + return super()._union(other, sort=sort) if not isinstance(other, TimedeltaIndex): try: @@ -358,7 +343,7 @@ def union(self, other): if this._can_fast_union(other): return this._fast_union(other) else: - result = Index.union(this, other) + result = Index._union(this, other, sort=sort) if isinstance(result, TimedeltaIndex): if result.freq is None: result.freq = to_offset(result.inferred_freq) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 71d1e686f5c02..674f600bc8693 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -447,11 +447,7 @@ def test_intersection_base(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.intersection(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.intersection(case) @@ -474,11 +470,7 @@ def 
test_union_base(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.union(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.union(case) @@ -506,11 +498,7 @@ def test_difference_base(self, sort): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.difference(case, sort) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): assert result.__class__ == answer.__class__ @@ -540,11 +528,7 @@ def test_symmetric_difference(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.symmetric_difference(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.symmetric_difference(case) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 632d5b2875a5a..83f1f22b158b1 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,23 +5,25 @@ from pandas.core.indexes.api import Index, MultiIndex import pandas.util.testing as tm - -@pytest.fixture(params=[tm.makeUnicodeIndex(100), - tm.makeStringIndex(100), - tm.makeDateIndex(100), - tm.makePeriodIndex(100), - tm.makeTimedeltaIndex(100), - tm.makeIntIndex(100), - tm.makeUIntIndex(100), - tm.makeRangeIndex(100), - tm.makeFloatIndex(100), - Index([True, False]), - tm.makeCategoricalIndex(100), - Index([]), - 
MultiIndex.from_tuples(zip( - ['foo', 'bar', 'baz'], [1, 2, 3])), - Index([0, 0, 1, 1, 2, 2])], - ids=lambda x: type(x).__name__) +indices_list = [tm.makeUnicodeIndex(100), + tm.makeStringIndex(100), + tm.makeDateIndex(100), + tm.makePeriodIndex(100), + tm.makeTimedeltaIndex(100), + tm.makeIntIndex(100), + tm.makeUIntIndex(100), + tm.makeRangeIndex(100), + tm.makeFloatIndex(100), + Index([True, False]), + tm.makeCategoricalIndex(100), + tm.makeIntervalIndex(100), + Index([]), + MultiIndex.from_tuples(zip( + ['foo', 'bar', 'baz'], [1, 2, 3])), + Index([0, 0, 1, 1, 2, 2])] + + +@pytest.fixture(params=indices_list, ids=lambda x: type(x).__name__) def indices(request): return request.param diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index efa6d006bad6f..01649cb4646de 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -298,9 +298,9 @@ def test_join_with_period_index(self, join_type): c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] - msg = 'can only call with other PeriodIndex-ed objects' - with pytest.raises(ValueError, match=msg): - df.columns.join(s.index, how=join_type) + expected = df.columns.astype('O').join(s.index, how=join_type) + result = df.columns.join(s.index, how=join_type) + tm.assert_index_equal(expected, result) def test_factorize(self): idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 45a3a64216cab..fd666f3d56c9d 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -29,11 +29,20 @@ def test_union2(self, sort): union = first.union(second, sort=sort) tm.assert_index_equal(union, everything) + @pytest.mark.parametrize("box", [np.array, Series, list]) + @pytest.mark.parametrize("sort", [None, False]) + def test_union3(self, sort, 
box): + everything = tm.makeDateIndex(10) + first = everything[:5] + second = everything[5:] + # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.union(case, sort=sort) - tm.assert_index_equal(result, everything) + expected = first.astype('O').union( + pd.Index(second.values, dtype='O') + ).astype('O') + case = box(second.values) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("tz", tz) @pytest.mark.parametrize("sort", [None, False]) @@ -303,11 +312,12 @@ def test_datetimeindex_union_join_empty(self, sort): empty = Index([]) result = dti.union(empty, sort=sort) - assert isinstance(result, DatetimeIndex) - assert result is result + expected = dti.astype('O') + tm.assert_index_equal(result, expected) result = dti.join(empty) assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) def test_join_nonunique(self): idx1 = to_datetime(['2012-11-06 16:00:11.477563', diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 3f876565119cb..368dc68e516df 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -1077,7 +1077,10 @@ def test_dti_union_aware(self): tz="US/Eastern") result = rng.union(rng2) - assert result.tz.zone == 'UTC' + expected = rng.astype('O').union(rng2.astype('O')) + tm.assert_index_equal(result, expected) + assert result[0].tz.zone == 'US/Central' + assert result[-1].tz.zone == 'US/Eastern' @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central", dateutil.tz.tzoffset(None, -28800)]) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 61465d8454383..f4f63aaecd336 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -901,15 +901,18 @@ def 
test_symmetric_difference(self, closed, sort): @pytest.mark.parametrize('op_name', [ 'union', 'intersection', 'difference', 'symmetric_difference']) @pytest.mark.parametrize("sort", [None, False]) - def test_set_operation_errors(self, closed, op_name, sort): + def test_set_incompatible_types(self, closed, op_name, sort): index = self.create_index(closed=closed) set_op = getattr(index, op_name) + # TODO: standardize return type of non-union setops type(self vs other) # non-IntervalIndex - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type Int64Index') - with pytest.raises(TypeError, match=msg): - set_op(Index([1, 2, 3]), sort=sort) + if op_name == 'difference': + expected = index + else: + expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) + result = set_op(Index([1, 2, 3]), sort=sort) + tm.assert_index_equal(result, expected) # mixed closed msg = ('can only do set operations between two IntervalIndex objects ' diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 29d07a0985574..a9102aeec060c 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -127,10 +127,6 @@ def test_union_misc(self, sort): with pytest.raises(period.IncompatibleFrequency): index.union(index2, sort=sort) - msg = 'can only call with other PeriodIndex-ed objects' - with pytest.raises(ValueError, match=msg): - index.join(index.to_timestamp()) - index3 = period_range('1/1/2000', '1/20/2000', freq='2D') with pytest.raises(period.IncompatibleFrequency): index.join(index3) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7b507a9de6b5d..7e70d77ea70fc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -889,6 +889,8 @@ def test_union_identity(self, sort): # i.e. 
identity is not preserved when sort is True assert (union is first) is (not sort) + # This should no longer be the same object, since [] is not consistent, + # both objects will be recast to dtype('O') union = first.union([], sort=sort) assert (union is first) is (not sort) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py new file mode 100644 index 0000000000000..b626ced2ccb1b --- /dev/null +++ b/pandas/tests/indexes/test_setops.py @@ -0,0 +1,76 @@ +''' +The tests in this package are to ensure the proper resultant dtypes of +set operations. +''' +import itertools as it + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_dtype_equal + +import pandas as pd +from pandas import Int64Index, RangeIndex +from pandas.tests.indexes.conftest import indices_list +import pandas.util.testing as tm + +COMPATIBLE_INCONSISTENT_PAIRS = { + (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex) +} + + +@pytest.fixture(params=list(it.combinations(indices_list, 2)), + ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__) +def index_pair(request): + """ + Create all combinations of 2 index types. 
+ """ + return request.param + + +def test_union_same_types(indices): + # Union with a non-unique, non-monotonic index raises error + # Only needed for bool index factory + idx1 = indices.sort_values() + idx2 = indices.sort_values() + assert idx1.union(idx2).dtype == idx1.dtype + + +def test_union_different_types(index_pair): + # GH 23525 + idx1, idx2 = index_pair + type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) + if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: + pytest.xfail('This test only considers non compatible indexes.') + + if any(isinstance(idx, pd.MultiIndex) for idx in index_pair): + pytest.xfail('This test doesn\'t consider multiindixes.') + + if is_dtype_equal(idx1.dtype, idx2.dtype): + pytest.xfail('This test only considers non matching dtypes.') + + # A union with a CategoricalIndex (even as dtype('O')) and a + # non-CategoricalIndex can only be made if both indices are monotonic. + # This is true before this PR as well. + + # Union with a non-unique, non-monotonic index raises error + # This applies to the boolean index + idx1 = idx1.sort_values() + idx2 = idx2.sort_values() + + assert idx1.union(idx2).dtype == np.dtype('O') + assert idx2.union(idx1).dtype == np.dtype('O') + + +@pytest.mark.parametrize('idx_fact1,idx_fact2', + COMPATIBLE_INCONSISTENT_PAIRS.values()) +def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): + # GH 23525 + idx1 = idx_fact1(10) + idx2 = idx_fact2(20) + + res1 = idx1.union(idx2) + res2 = idx2.union(idx1) + + assert res1.dtype in (idx1.dtype, idx2.dtype) + assert res2.dtype in (idx1.dtype, idx2.dtype) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 3d9f3da75306a..ecd62380d8c65 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -960,22 +960,23 @@ def test_append_different_columns_types_raises( df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) ser = pd.Series([7, 8, 9], 
index=index_cannot_append_with_other, name=2) - msg = ("the other index needs to be an IntervalIndex too, but was" + msg = (r"unorderable types: (Interval|int)\(\) (<|>) " + r"(int|long|float|str|Timestamp)\(\)|" + r"Expected tuple, got (int|long|float|str)|" + r"Cannot compare type 'Timestamp' with type '(int|long)'|" + r"'(<|>)' not supported between instances of 'int' " + r"and '(str|Timestamp)'|" + r"the other index needs to be an IntervalIndex too, but was" r" type {}|" r"object of type '(int|float|Timestamp)' has no len\(\)|" "Expected tuple, got str") - with pytest.raises(TypeError, match=msg.format( - index_can_append.__class__.__name__)): + with pytest.raises(TypeError, match=msg): df.append(ser) df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other) ser = pd.Series([7, 8, 9], index=index_can_append, name=2) - msg = (r"unorderable types: (Interval|int)\(\) > " - r"(int|float|str)\(\)|" - r"Expected tuple, got (int|float|str)|" - r"Cannot compare type 'Timestamp' with type 'int'|" - r"'>' not supported between instances of 'int' and 'str'") + with pytest.raises(TypeError, match=msg): df.append(ser) @@ -2029,7 +2030,8 @@ def test_concat_empty_series(self): s1 = pd.Series([1, 2, 3], name='x') s2 = pd.Series(name='y') res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) + exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}, + index=pd.Index([0, 1, 2], dtype='O')) tm.assert_frame_equal(res, exp) s1 = pd.Series([1, 2, 3], name='x') @@ -2044,7 +2046,8 @@ def test_concat_empty_series(self): s2 = pd.Series(name=None) res = pd.concat([s1, s2], axis=1) exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0]) + columns=['x', 0], + index=pd.Index([0, 1, 2], dtype='O')) tm.assert_frame_equal(res, exp) @pytest.mark.parametrize('tz', [None, 'UTC']) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py 
index 41c3e220ad06f..ed5cf2d6b2c51 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -100,6 +100,7 @@ def test_combine_first(self): # corner case s = Series([1., 2, 3], index=[0, 1, 2]) result = s.combine_first(Series([], index=[])) + s.index = s.index.astype('O') assert_series_equal(s, result) def test_update(self): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 13e8d6c885029..11ad238eecd77 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -912,7 +912,7 @@ def test_interpolate_pchip(self): # interpolate at new_index new_index = ser.index.union(Index([49.25, 49.5, 49.75, 50.25, 50.5, - 50.75])) + 50.75])).astype(float) interp_s = ser.reindex(new_index).interpolate(method='pchip') # does not blow up, GH5977 interp_s[49:51] @@ -928,7 +928,9 @@ def test_interpolate_akima(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate(method='akima') assert_series_equal(interp_s[1:3], expected) @@ -941,7 +943,9 @@ def test_interpolate_piecewise_polynomial(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate( method='piecewise_polynomial') assert_series_equal(interp_s[1:3], expected) @@ -955,7 +959,9 @@ def test_interpolate_from_derivatives(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = 
ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate( method='from_derivatives') assert_series_equal(interp_s[1:3], expected) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index fee1976698b04..215fa9f22277e 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -8,10 +8,12 @@ from pandas import ( Categorical, DataFrame, Index, Series, bdate_range, date_range, isna) from pandas.core import ops +from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, assert_frame_equal, assert_index_equal, + assert_series_equal) from .common import TestData @@ -171,7 +173,6 @@ def test_scalar_na_logical_ops_corners(self): operator.and_, operator.or_, operator.xor, - ]) def test_logical_ops_with_index(self, op): # GH#22092, GH#19792 @@ -190,6 +191,37 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + @pytest.mark.parametrize('op', [ + pytest.param(ops.rand_, + marks=pytest.mark.xfail(reason="GH#22092 Index " + "implementation returns " + "Index", + raises=AssertionError, + strict=True)), + pytest.param(ops.ror_, + marks=pytest.mark.xfail(reason="Index.get_indexer " + "with non unique index", + raises=InvalidIndexError, + strict=True)), + ops.rxor, + ]) + def test_reversed_logical_ops_with_index(self, op): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + # symmetric_difference is only for rxor, but other 2 should fail + expected = idx1.symmetric_difference(ser) + + result = op(ser, idx1) + assert_index_equal(result, expected) + + expected = idx2.symmetric_difference(ser) + + result = op(ser, 
idx2) + assert_index_equal(result, expected) + @pytest.mark.parametrize("op, expected", [ (ops.rand_, pd.Index([False, True])), (ops.ror_, pd.Index([False, True])),