From 862812308abbb72e4355782f2ea442cb88ce4787 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 1 Jun 2019 20:59:49 +0200 Subject: [PATCH 1/5] PERF: custom ops for RangeIndex.[all|any|__contain__] --- pandas/core/common.py | 10 ++++++++++ pandas/core/indexes/base.py | 6 +----- pandas/core/indexes/range.py | 17 ++++++++++++++++- pandas/tests/indexes/test_range.py | 25 +++++++++++++++++++++---- 4 files changed, 48 insertions(+), 10 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 771ded04f461d..470be526d0f7d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -490,3 +490,13 @@ def f(x): f = mapper return f + + +def ensure_python_int(value: Union[int, Any]) -> int: + msg = "Wrong type {} for value {}" + try: + new_value = int(value) + assert (new_value == value) + except (TypeError, ValueError, AssertionError): + raise TypeError(msg.format(type(value), value)) + return new_value diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8538687ca3e91..b8c020ff0edb1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4013,11 +4013,7 @@ def __contains__(self, key): @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): - hash(key) - try: - return key in self._engine - except (TypeError, ValueError): - return False + return key in self def __hash__(self): raise TypeError("unhashable type: %r" % type(self).__name__) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 82fd7342c027c..51114fea21853 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -334,6 +334,14 @@ def is_monotonic_decreasing(self): def has_duplicates(self): return False + def __contains__(self, key): + hash(key) + try: + key = com.ensure_python_int(key) + except TypeError: + return False + return key in self._range + @Appender(_index_shared_docs['get_loc']) def get_loc(self, key, method=None, tolerance=None): if is_integer(key) and method 
is None and tolerance is None: @@ -640,6 +648,14 @@ def __floordiv__(self, other): return self._simple_new(start, start + 1, 1, name=self.name) return self._int64index // other + def all(self) -> bool: + if 0 in self._range: + return False + return True + + def any(self) -> bool: + return any(self._range) + @classmethod def _add_numeric_methods_binary(cls): """ add in numeric methods, specialized to RangeIndex """ @@ -725,4 +741,3 @@ def _evaluate_numeric_binop(self, other): RangeIndex._add_numeric_methods() -RangeIndex._add_logical_methods() diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index bca50186827de..2d9f2b1390905 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -245,10 +245,9 @@ def test_dtype(self): assert self.index.dtype == np.int64 def test_cached_data(self): - # GH 26565 - # Calling RangeIndex._data caches an int64 array of the same length as - # self at self._cached_data. - # This tests whether _cached_data is being set by various operations. + # GH 26565, GH26617 + # Calling RangeIndex._data caches an int64 array of the same length at + # self._cached_data. This tests whether _cached_data has been set. 
idx = RangeIndex(0, 100, 10) assert idx._cached_data is None @@ -262,6 +261,24 @@ def test_cached_data(self): idx.get_loc(20) assert idx._cached_data is None + 90 in idx + assert idx._cached_data is None + + 91 in idx + assert idx._cached_data is None + + idx.contains(90) + assert idx._cached_data is None + + idx.contains(91) + assert idx._cached_data is None + + idx.all() + assert idx._cached_data is None + + idx.any() + assert idx._cached_data is None + df = pd.DataFrame({'a': range(10)}, index=idx) df.loc[50] From d9b12383051f4e0eff0e74efa273721edf7f01b8 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 2 Jun 2019 14:14:51 +0200 Subject: [PATCH 2/5] changes --- pandas/core/common.py | 10 ---------- pandas/core/dtypes/cast.py | 11 +++++++++++ pandas/core/indexes/range.py | 8 ++++---- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 470be526d0f7d..771ded04f461d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -490,13 +490,3 @@ def f(x): f = mapper return f - - -def ensure_python_int(value: Union[int, Any]) -> int: - msg = "Wrong type {} for value {}" - try: - new_value = int(value) - assert (new_value == value) - except (TypeError, ValueError, AssertionError): - raise TypeError(msg.format(type(value), value)) - return new_value diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d0f392df70c85..1f9ccfda35b91 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1,6 +1,7 @@ """ routings for casting """ from datetime import datetime, timedelta +from typing import Any, Union import numpy as np @@ -1333,3 +1334,13 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise ValueError("Trying to coerce float values to integers") + + +def ensure_python_int(value: Union[int, Any]) -> int: + msg = "Wrong type {} for value {}" + try: + new_value = int(value) + assert 
(new_value == value) + except (TypeError, ValueError, AssertionError): + raise TypeError(msg.format(type(value), value)) + return new_value diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 51114fea21853..f9e3699765b11 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -10,7 +10,7 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly -from pandas.core.dtypes import concat as _concat +from pandas.core.dtypes import cast, concat as _concat from pandas.core.dtypes.common import ( ensure_python_int, is_int64_dtype, is_integer, is_scalar, is_timedelta64_dtype) @@ -337,7 +337,7 @@ def has_duplicates(self): def __contains__(self, key): hash(key) try: - key = com.ensure_python_int(key) + key = cast.ensure_python_int(key) except TypeError: return False return key in self._range @@ -648,12 +648,12 @@ def __floordiv__(self, other): return self._simple_new(start, start + 1, 1, name=self.name) return self._int64index // other - def all(self) -> bool: + def all(self): if 0 in self._range: return False return True - def any(self) -> bool: + def any(self): return any(self._range) @classmethod From 9ef40b99744edd98f168f7e0a1f2a4f6efadb0e9 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 3 Jun 2019 00:04:23 +0200 Subject: [PATCH 3/5] Changes to .all() --- pandas/core/dtypes/cast.py | 11 ----------- pandas/core/indexes/range.py | 8 +++----- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1f9ccfda35b91..d0f392df70c85 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1,7 +1,6 @@ """ routings for casting """ from datetime import datetime, timedelta -from typing import Any, Union import numpy as np @@ -1334,13 +1333,3 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise 
ValueError("Trying to coerce float values to integers") - - -def ensure_python_int(value: Union[int, Any]) -> int: - msg = "Wrong type {} for value {}" - try: - new_value = int(value) - assert (new_value == value) - except (TypeError, ValueError, AssertionError): - raise TypeError(msg.format(type(value), value)) - return new_value diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index f9e3699765b11..e9d9afbcef67e 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -10,7 +10,7 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly -from pandas.core.dtypes import cast, concat as _concat +from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.common import ( ensure_python_int, is_int64_dtype, is_integer, is_scalar, is_timedelta64_dtype) @@ -337,7 +337,7 @@ def has_duplicates(self): def __contains__(self, key): hash(key) try: - key = cast.ensure_python_int(key) + key = ensure_python_int(key) except TypeError: return False return key in self._range @@ -649,9 +649,7 @@ def __floordiv__(self, other): return self._int64index // other def all(self): - if 0 in self._range: - return False - return True + return 0 not in self._range def any(self): return any(self._range) From 1e92983e86068014ef99c1443e4c6b3cc791cadc Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 5 Jun 2019 16:34:44 +0200 Subject: [PATCH 4/5] Rabased and added typing --- pandas/core/indexes/range.py | 7 ++++--- pandas/tests/indexes/test_range.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e9d9afbcef67e..14ebc3c7e8e2a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,6 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof +from typing import Union import warnings import numpy as np @@ -334,7 +335,7 @@ def is_monotonic_decreasing(self): def 
has_duplicates(self): return False - def __contains__(self, key): + def __contains__(self, key: Union[int, np.integer]) -> bool: hash(key) try: key = ensure_python_int(key) @@ -648,10 +649,10 @@ def __floordiv__(self, other): return self._simple_new(start, start + 1, 1, name=self.name) return self._int64index // other - def all(self): + def all(self) -> bool: return 0 not in self._range - def any(self): + def any(self) -> bool: return any(self._range) @classmethod diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 2d9f2b1390905..6eece0ed8efee 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -247,7 +247,7 @@ def test_dtype(self): def test_cached_data(self): # GH 26565, GH26617 # Calling RangeIndex._data caches an int64 array of the same length at - # self._cached_data. This tests whether _cached_data has been set. + # self._cached_data. This test checks whether _cached_data has been set idx = RangeIndex(0, 100, 10) assert idx._cached_data is None From 4aea46d033aa5a6d32b0127e1df0485127e36846 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 6 Jun 2019 01:34:29 +0200 Subject: [PATCH 5/5] add issue number to whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1fb9b5ae695a0..0f31078d7bf43 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -502,7 +502,7 @@ Performance Improvements - Improved performance of :meth:`Series.searchsorted`. 
The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) -- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`) +- Improved performance of slicing and other selected operations on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) - Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)