diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index a5ccf5c32b876..34a8d552304d1 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -38,6 +38,43 @@ def time_isin(self, dtypes): self.s.isin(self.values) +class IsInForObjects(object): + + def setup(self): + self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object) + self.vals_nans = np.full(10**4, np.nan).astype(np.object) + self.s_short = Series(np.arange(2)).astype(np.object) + self.s_long = Series(np.arange(10**5)).astype(np.object) + self.vals_short = np.arange(2).astype(np.object) + self.vals_long = np.arange(10**5).astype(np.object) + # because of nans floats are special: + self.s_long_floats = Series(np.arange(10**5, + dtype=np.float)).astype(np.object) + self.vals_long_floats = np.arange(10**5, + dtype=np.float).astype(np.object) + + def time_isin_nans(self): + # if nan-objects are different objects, + # this has the potential to trigger O(n^2) running time + self.s_nans.isin(self.vals_nans) + + def time_isin_short_series_long_values(self): + # running time dominated by the preprocessing + self.s_short.isin(self.vals_long) + + def time_isin_long_series_short_values(self): + # running time dominated by look-up + self.s_long.isin(self.vals_short) + + def time_isin_long_series_long_values(self): + # no dominating part + self.s_long.isin(self.vals_long) + + def time_isin_long_series_long_values_floats(self): + # no dominating part + self.s_long_floats.isin(self.vals_long_floats) + + class NSort(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 42e286f487a7d..a513c2bd03ac7 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -512,6 +512,7 @@ Missing - Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) +- :func:`Series.isin` now treats all nans as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) MultiIndex ^^^^^^^^^^ diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index e9fb49e8a5e42..45a93051f78d3 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -47,10 +47,19 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { PyErr_Clear(); return 0; } + if (result == 0) { // still could be two NaNs + return PyFloat_CheckExact(a) && + PyFloat_CheckExact(b) && + Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && + Py_IS_NAN(PyFloat_AS_DOUBLE(b)); + } return result; } - +// For PyObject_Hash holds: +// hash(0.0) == 0 == hash(-0.0) +// hash(X) == 0 if X is a NaN-value +// so it is OK to use it directly #define kh_python_hash_func(key) (PyObject_Hash(key)) #define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 754703dfc4bee..8ed8d2014f5bc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -13,7 +13,8 @@ from pandas.tests.indexes.common import Base from pandas.compat import (range, lrange, lzip, u, - text_type, zip, PY3, PY35, PY36, PYPY, StringIO) + text_type, zip, PY3, PY35, PY36, StringIO) +import math import operator import numpy as np @@ -1661,9 +1662,13 @@ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2): # Test cartesian product of null fixtures and ensure that we don't # mangle the various types (save a corner case with PyPy) - if PYPY and nulls_fixture is np.nan: # np.nan is float('nan') on PyPy + # all nans are the same + if (isinstance(nulls_fixture, float) and + isinstance(nulls_fixture2, float) and + math.isnan(nulls_fixture) and + math.isnan(nulls_fixture2)): tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [float('nan')]), np.array([False, True])) + [nulls_fixture2]), np.array([False, True])) elif nulls_fixture is nulls_fixture2: # should preserve NA type tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 7ce2aaf7d7fbb..d8d98318b4c72 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -623,6 +623,14 @@ def test_empty(self, empty): result = algos.isin(vals, empty) tm.assert_numpy_array_equal(expected, result) + def test_different_nan_objects(self): + # GH 22119 + comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object) + vals = np.array([float('nan')], dtype=np.object) + expected = np.array([False, False, True]) + result = algos.isin(comps, vals) + tm.assert_numpy_array_equal(expected, result) + class TestValueCounts(object):