diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index 023c200e271ab..39635cb0e612f 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -582,7 +582,7 @@ and :ref:`Advanced Indexing ` you may select along more than
 .. _indexing.basics.indexing_isin:
 
 Indexing with isin
-~~~~~~~~~~~~~~~~~~
+------------------
 
 Consider the ``isin`` method of Series, which returns a boolean vector that is
 true wherever the Series elements exist in the passed list. This allows you to
@@ -591,13 +591,30 @@ select rows where one or more columns have values you want:
 .. ipython:: python
 
    s = Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64')
 
-   s
+   s.isin([2, 4, 6])
+   s[s.isin([2, 4, 6])]
+
+The same method is available for ``Index`` objects and is useful for the cases
+when you don't know which of the sought labels are in fact present:
 
-   s.isin([2, 4])
+.. ipython:: python
+
+   s[s.index.isin([2, 4, 6])]
 
-   s[s.isin([2, 4])]
+   # compare it to the following
+   s[[2, 4, 6]]
 
+In addition to that, ``MultiIndex`` allows selecting a separate level to use
+in the membership check:
+
+.. ipython:: python
+
+   s_mi = Series(np.arange(6),
+                 index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
+   s_mi
+   s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
+   s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
 
 DataFrame also has an ``isin`` method. When calling ``isin``, pass a set of
 values as either an array or dict. If values is an array, ``isin`` returns
@@ -1622,12 +1639,6 @@ with duplicates dropped.
    idx1.sym_diff(idx2)
    idx1 ^ idx2
 
-The ``isin`` method of Index objects
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-One additional operation is the ``isin`` method that works analogously to the
-``Series.isin`` method found :ref:`here <indexing.basics.indexing_isin>`.
-
 .. _indexing.hierarchical:
 
 Hierarchical indexing (MultiIndex)
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index 109ed8b286c22..0f9633bdb908b 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -129,6 +129,19 @@ API changes
   strings must contain 244 or fewer characters. Attempting to write Stata
   dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)
 
+- ``Index.isin`` now supports a ``level`` argument to specify which index level
+  to use for membership tests (:issue:`7892`, :issue:`7890`)
+
+  .. code-block:: python
+
+     In [1]: idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']])
+
+     In [2]: idx.values
+     Out[2]: array([(0, 'a'), (0, 'b'), (0, 'c'), (1, 'a'), (1, 'b'), (1, 'c')], dtype=object)
+
+     In [3]: idx.isin(['a', 'c', 'e'], level=1)
+     Out[3]: array([ True, False,  True,  True, False,  True], dtype=bool)
+
 
 .. _whatsnew_0150.cat:
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 263e6db8c486a..94bc48d0f4342 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -12,7 +12,7 @@
 import pandas.index as _index
 from pandas.lib import Timestamp, is_datetime_array
 from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin
-from pandas.util.decorators import cache_readonly, deprecate
+from pandas.util.decorators import cache_readonly, deprecate, Appender
 from pandas.core.common import isnull, array_equivalent
 import pandas.core.common as com
 from pandas.core.common import (_values_from_object, is_float, is_integer,
@@ -687,13 +687,35 @@ def _engine(self):
         # property, for now, slow to look up
         return self._engine_type(lambda: self.values, len(self))
 
+    def _validate_index_level(self, level):
+        """
+        Validate index level.
+
+        For single-level Index getting level number is a no-op, but some
+        verification must be done like in MultiIndex.
+
+        Raises
+        ------
+        IndexError
+            If `level` is an int other than 0 or -1.
+        KeyError
+            If `level` is not an int and differs from the index name.
+        """
+        if isinstance(level, int):
+            if level < 0 and level != -1:
+                raise IndexError("Too many levels: Index has only 1 level,"
+                                 " %d is not a valid level number" % (level,))
+            elif level > 0:
+                raise IndexError("Too many levels:"
+                                 " Index has only 1 level, not %d" %
+                                 (level + 1))
+        elif level != self.name:
+            raise KeyError('Level %s must be same as name (%s)'
+                           % (level, self.name))
+
     def _get_level_number(self, level):
-        if not isinstance(level, int):
-            if level != self.name:
-                raise AssertionError('Level %s must be same as name (%s)'
-                                     % (level, self.name))
-            level = 0
-        return level
+        self._validate_index_level(level)
+        return 0
 
     @cache_readonly
     def inferred_type(self):
@@ -1271,7 +1293,7 @@ def get_level_values(self, level):
         values : ndarray
         """
         # checks that level number is actually just 1
-        self._get_level_number(level)
+        self._validate_index_level(level)
         return self
 
     def get_indexer(self, target, method=None, limit=None):
@@ -1370,7 +1392,7 @@ def groupby(self, to_groupby):
     def map(self, mapper):
         return self._arrmap(self.values, mapper)
 
-    def isin(self, values):
+    def isin(self, values, level=None):
         """
         Compute boolean array of whether each index value is found in the
         passed set of values
@@ -1378,12 +1400,26 @@ def isin(self, values):
         Parameters
         ----------
         values : set or sequence of values
+            Sought values.
+        level : str or int, optional
+            Name or position of the index level to use (if the index is a
+            MultiIndex).
 
         Returns
         -------
         is_contained : ndarray (boolean dtype)
+
+        Notes
+        -----
+        If `level` is specified:
+
+        - if it is the name of one *and only one* index level, use that level;
+        - otherwise it should be a number indicating level position.
+
         """
         value_set = set(values)
+        if level is not None:
+            self._validate_index_level(level)
         return lib.ismember(self._array_values(), value_set)
 
     def _array_values(self):
@@ -2149,20 +2185,11 @@ def hasnans(self):
     def is_unique(self):
         return super(Float64Index, self).is_unique and self._nan_idxs.size < 2
 
-    def isin(self, values):
-        """
-        Compute boolean array of whether each index value is found in the
-        passed set of values
-
-        Parameters
-        ----------
-        values : set or sequence of values
-
-        Returns
-        -------
-        is_contained : ndarray (boolean dtype)
-        """
+    @Appender(Index.isin.__doc__)
+    def isin(self, values, level=None):
         value_set = set(values)
+        if level is not None:
+            self._validate_index_level(level)
         return lib.ismember_nans(self._array_values(), value_set,
                                  isnull(list(value_set)).any())
 
@@ -4052,6 +4079,23 @@ def _wrap_joined_index(self, joined, other):
         names = self.names if self.names == other.names else None
         return MultiIndex.from_tuples(joined, names=names)
 
+    @Appender(Index.isin.__doc__)
+    def isin(self, values, level=None):
+        if level is None:
+            return lib.ismember(self._array_values(), set(values))
+        else:
+            num = self._get_level_number(level)
+            levs = self.levels[num]
+            labs = self.labels[num]
+
+            if levs.size == 0:
+                # an empty level cannot contain any of the sought values
+                return np.zeros(len(labs), dtype=np.bool_)
+            # positions of the sought values within the level are exactly
+            # the label codes to look for
+            sought_labels = levs.isin(values).nonzero()[0]
+            return np.in1d(labs, sought_labels)
+
 
 # For utility purposes
 
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index 8b1f6ce3e7f45..c32c7ddc55ced 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -840,7 +840,7 @@ def test_get_set_value(self):
         self.assertEqual(values[67], 10)
 
     def test_isin(self):
-        values = ['foo', 'bar']
+        values = ['foo', 'bar', 'quux']
 
         idx = Index(['qux', 'baz', 'foo', 'bar'])
         result = idx.isin(values)
@@ -853,6 +853,49 @@ def test_isin(self):
         self.assertEqual(len(result), 0)
         self.assertEqual(result.dtype, np.bool_)
 
+    def test_isin_nan(self):
+        self.assert_numpy_array_equal(
+            Index(['a', np.nan]).isin([np.nan]), [False, True])
+        self.assert_numpy_array_equal(
+            Index(['a', pd.NaT]).isin([pd.NaT]), [False, True])
+        self.assert_numpy_array_equal(
+            Index(['a', np.nan]).isin([float('nan')]), [False, False])
+        self.assert_numpy_array_equal(
+            Index(['a', np.nan]).isin([pd.NaT]), [False, False])
+        # Float64Index overrides isin, so must be checked separately
+        self.assert_numpy_array_equal(
+            Float64Index([1.0, np.nan]).isin([np.nan]), [False, True])
+        self.assert_numpy_array_equal(
+            Float64Index([1.0, np.nan]).isin([float('nan')]), [False, True])
+        self.assert_numpy_array_equal(
+            Float64Index([1.0, np.nan]).isin([pd.NaT]), [False, True])
+
+    def test_isin_level_kwarg(self):
+        def check_idx(idx):
+            values = idx.tolist()[-2:] + ['nonexisting']
+
+            expected = np.array([False, False, True, True])
+            self.assert_numpy_array_equal(expected, idx.isin(values, level=0))
+            self.assert_numpy_array_equal(expected, idx.isin(values, level=-1))
+
+            self.assertRaises(IndexError, idx.isin, values, level=1)
+            self.assertRaises(IndexError, idx.isin, values, level=10)
+            self.assertRaises(IndexError, idx.isin, values, level=-2)
+
+            self.assertRaises(KeyError, idx.isin, values, level=1.0)
+            self.assertRaises(KeyError, idx.isin, values, level='foobar')
+
+            idx.name = 'foobar'
+            self.assert_numpy_array_equal(expected,
+                                          idx.isin(values, level='foobar'))
+
+            self.assertRaises(KeyError, idx.isin, values, level='xyzzy')
+            self.assertRaises(KeyError, idx.isin, values, level=np.nan)
+
+        check_idx(Index(['qux', 'baz', 'foo', 'bar']))
+        # Float64Index overrides isin, so must be checked separately
+        check_idx(Float64Index([1.0, 2.0, 3.0, 4.0]))
+
     def test_boolean_cmp(self):
         values = [1, 2, 3, 4]
 
@@ -2948,6 +2991,55 @@ def test_level_setting_resets_attributes(self):
         # if this fails, probably didn't reset the cache correctly.
         assert not ind.is_monotonic
 
+    def test_isin(self):
+        values = [('foo', 2), ('bar', 3), ('quux', 4)]
+
+        idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
+                                      np.arange(4)])
+        result = idx.isin(values)
+        expected = np.array([False, False, True, True])
+        self.assert_numpy_array_equal(result, expected)
+
+        # empty, return dtype bool
+        idx = MultiIndex.from_arrays([[], []])
+        result = idx.isin(values)
+        self.assertEqual(len(result), 0)
+        self.assertEqual(result.dtype, np.bool_)
+
+    def test_isin_nan(self):
+        idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
+        self.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
+                                      [False, False])
+        self.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
+                                      [False, False])
+
+    def test_isin_level_kwarg(self):
+        idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
+                                      np.arange(4)])
+
+        vals_0 = ['foo', 'bar', 'quux']
+        vals_1 = [2, 3, 10]
+
+        expected = np.array([False, False, True, True])
+        self.assert_numpy_array_equal(expected, idx.isin(vals_0, level=0))
+        self.assert_numpy_array_equal(expected, idx.isin(vals_0, level=-2))
+
+        self.assert_numpy_array_equal(expected, idx.isin(vals_1, level=1))
+        self.assert_numpy_array_equal(expected, idx.isin(vals_1, level=-1))
+
+        self.assertRaises(IndexError, idx.isin, vals_0, level=5)
+        self.assertRaises(IndexError, idx.isin, vals_0, level=-5)
+
+        self.assertRaises(KeyError, idx.isin, vals_0, level=1.0)
+        self.assertRaises(KeyError, idx.isin, vals_1, level=-1.0)
+        self.assertRaises(KeyError, idx.isin, vals_1, level='A')
+
+        idx.names = ['A', 'B']
+        self.assert_numpy_array_equal(expected, idx.isin(vals_0, level='A'))
+        self.assert_numpy_array_equal(expected, idx.isin(vals_1, level='B'))
+
+        self.assertRaises(KeyError, idx.isin, vals_1, level='C')
+
 
 def test_get_combined_index():
     from pandas.core.index import _get_combined_index