From db29fdad8fe563c8a2a8a02f0e03e85b956c0fea Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Wed, 12 Jul 2017 20:18:58 +0530 Subject: [PATCH 1/4] BUG: GH16875 Fix inconsistency in groupby transformations --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/groupby.py | 2 +- pandas/tests/groupby/test_transform.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index a5d4259480ba8..762107a261090 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -175,7 +175,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) - Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) - +- Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) Sparse ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index daf3381ae4e89..0356d8ca28905 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3055,7 +3055,7 @@ def transform(self, func, *args, **kwargs): # we have a numeric dtype, as these are *always* udfs # the cython take a different path (and casting) dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype): + if is_numeric_dtype(dtype) and not is_bool_dtype(result.dtype): result = maybe_downcast_to_dtype(result, dtype) result.name = self._selected_obj.name diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 40434ff510421..98839a17d6e0c 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -195,6 +195,19 @@ def
test_transform_bug(self): expected = Series(np.arange(5, 0, step=-1), name='B') assert_series_equal(result, expected) + def test_transform_numeric_to_boolean(self): + # GH 16875 + # inconsistency in transforming boolean values + expected = pd.Series([True, True], name='A') + + df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]}) + result = df.groupby('B').A.transform(lambda x: True) + assert_series_equal(result, expected) + + df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]}) + result = df.groupby('B').A.transform(lambda x: True) + assert_series_equal(result, expected) + def test_transform_datetime_to_timedelta(self): # GH 15429 # transforming a datetime to timedelta From 8740fd6c933c9c8bd76a63f0e5bc453e41852b8c Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Thu, 13 Jul 2017 10:36:33 +0530 Subject: [PATCH 2/4] BUG: GH16875 Move guard from groupby to in core.dtypes.cast --- pandas/core/dtypes/cast.py | 4 +++- pandas/core/groupby.py | 2 +- pandas/tests/dtypes/test_cast.py | 8 +++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 16b0a5c8a74ca..103c326414930 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -110,7 +110,9 @@ def trans(x): # noqa np.prod(result.shape)): return result - if issubclass(dtype.type, np.floating): + # don't convert bool to float GH16875 + if issubclass(dtype.type, np.floating) and\ + not is_bool_dtype(result.dtype): return result.astype(dtype) elif is_bool_dtype(dtype) or is_integer_dtype(dtype): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0356d8ca28905..daf3381ae4e89 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3055,7 +3055,7 @@ def transform(self, func, *args, **kwargs): # we have a numeric dtype, as these are *always* udfs # the cython take a different path (and casting) dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype) and not is_bool_dtype(result.dtype): + if 
is_numeric_dtype(dtype): result = maybe_downcast_to_dtype(result, dtype) result.name = self._selected_obj.name diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 767e99d98cf29..6e07487b3e04f 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta, date import numpy as np -from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT +from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT, Series from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, @@ -45,6 +45,12 @@ def test_downcast_conv(self): expected = np.array([8, 8, 8, 8, 9]) assert (np.array_equal(result, expected)) + # GH16875 coercing of bools + ser = Series([True, True, False]) + result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) + expected = ser + tm.assert_series_equal(result, expected) + # conversions expected = np.array([1, 2]) From 0ae525b4e1f68c3537ee016019329ff499c0bda3 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 15 Jul 2017 13:21:57 +0530 Subject: [PATCH 3/4] use parens instead of backward slash --- pandas/core/dtypes/cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 103c326414930..ff137601e3aa6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -111,8 +111,8 @@ def trans(x): # noqa return result # don't convert bool to float GH16875 - if issubclass(dtype.type, np.floating) and\ - not is_bool_dtype(result.dtype): + if (issubclass(dtype.type, np.floating) and + not is_bool_dtype(result.dtype)): return result.astype(dtype) elif is_bool_dtype(dtype) or is_integer_dtype(dtype): From cecf6373e947b07d0571f22c58497e0873a92d4b Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sun, 16 Jul 2017 01:35:30 +0530 Subject: [PATCH 4/4] GH16875 rearrange if clause in pandas.core.dtypes.cast.maybe_downcast_to_dtype --- 
pandas/core/dtypes/cast.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ff137601e3aa6..6532e17695c86 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -110,11 +110,7 @@ def trans(x): # noqa np.prod(result.shape)): return result - # don't convert bool to float GH16875 - if (issubclass(dtype.type, np.floating) and - not is_bool_dtype(result.dtype)): - return result.astype(dtype) - elif is_bool_dtype(dtype) or is_integer_dtype(dtype): + if is_bool_dtype(dtype) or is_integer_dtype(dtype): # if we don't have any elements, just astype it if not np.prod(result.shape): @@ -146,6 +142,9 @@ def trans(x): # noqa # hit here if (new_result == result).all(): return new_result + elif (issubclass(dtype.type, np.floating) and + not is_bool_dtype(result.dtype)): + return result.astype(dtype) # a datetimelike # GH12821, iNaT is casted to float