From d35c39a1f2aacf72dcaa9e346298f66da0b32d7e Mon Sep 17 00:00:00 2001 From: Enis Date: Sun, 19 May 2019 14:51:37 +0100 Subject: [PATCH 1/8] BUG: Ensure that top and freq are reported as None for empty dataframes --- pandas/core/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 76910f425836e..018d562a9c67b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9919,6 +9919,9 @@ def describe_categorical_1d(data): else: names += ['top', 'freq'] result += [top, freq] + else: + names += ['top', 'freq'] + result += [None, None] return pd.Series(result, index=names, name=data.name) From e7a7bdf228d752fc832187864598b21bb98a2b8f Mon Sep 17 00:00:00 2001 From: Enis Date: Mon, 20 May 2019 19:46:17 +0100 Subject: [PATCH 2/8] BUG: Ensure that the index values obtained when calling describe on an empty Categorical / Object column is the same as that of an non empty column --- pandas/core/generic.py | 2 ++ pandas/tests/frame/test_analytics.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 018d562a9c67b..a174eed99c9b5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9919,6 +9919,8 @@ def describe_categorical_1d(data): else: names += ['top', 'freq'] result += [top, freq] + + # If the DataFrame is empty, set 'top' and 'freq' to None to maintain output shape consistency else: names += ['top', 'freq'] result += [None, None] diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index effe7eb47323d..f8c9eca706a8c 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -588,6 +588,12 @@ def test_describe_categorical(self): result = df3.describe() tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) + # Ensure the index of an an empty categoric DataFrame column also contains (count, unique, top, freq) + df = 
pd.DataFrame({"empty_col": Categorical([])}) + result = df.describe() + expected = DataFrame({'empty_col': [0, 0, None, None]}, index=['count', 'unique', 'top', 'freq']) + tm.assert_frame_equal(result, expected) + def test_describe_categorical_columns(self): # GH 11558 columns = pd.CategoricalIndex(['int1', 'int2', 'obj'], @@ -608,6 +614,7 @@ def test_describe_categorical_columns(self): index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], columns=exp_columns) + tm.assert_frame_equal(result, expected) tm.assert_categorical_equal(result.columns.values, expected.columns.values) From 10df1570bf8f17146400ad8462d87dc9e5de1f37 Mon Sep 17 00:00:00 2001 From: Enis Date: Mon, 20 May 2019 19:52:13 +0100 Subject: [PATCH 3/8] BUG: Ensure that the index values obtained when calling describe on an empty Categorical / Object column is the same as that of an non empty column --- pandas/core/generic.py | 3 ++- pandas/tests/frame/test_analytics.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a174eed99c9b5..080bb6d0b4368 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9920,7 +9920,8 @@ def describe_categorical_1d(data): names += ['top', 'freq'] result += [top, freq] - # If the DataFrame is empty, set 'top' and 'freq' to None to maintain output shape consistency + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency else: names += ['top', 'freq'] result += [None, None] diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f8c9eca706a8c..adc2b9e697370 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -588,10 +588,12 @@ def test_describe_categorical(self): result = df3.describe() tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) - # Ensure the index of an an empty categoric DataFrame column also contains (count, unique, 
top, freq) + # Ensure the index of an an empty categoric DataFrame column + # also contains (count, unique, top, freq) df = pd.DataFrame({"empty_col": Categorical([])}) result = df.describe() - expected = DataFrame({'empty_col': [0, 0, None, None]}, index=['count', 'unique', 'top', 'freq']) + expected = DataFrame({'empty_col': [0, 0, None, None]}, + index=['count', 'unique', 'top', 'freq']) tm.assert_frame_equal(result, expected) def test_describe_categorical_columns(self): From 1c7b6d1b830819c0c3cdba98f284ce79376b6e99 Mon Sep 17 00:00:00 2001 From: Enis Date: Mon, 20 May 2019 22:06:27 +0100 Subject: [PATCH 4/8] BUG: Ensure that the index values obtained when calling describe on an empty Categorical / Object column is the same as that of an non empty column --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d25ccd1b158be..8ca4ba182b545 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1477,7 +1477,7 @@ def value_counts(self, dropna=True): if dropna or clean: obs = code if clean else code[mask] - count = bincount(obs, minlength=ncat or None) + count = bincount(obs, minlength=ncat or 0) else: count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) From 302b28ce2aea4a0735ed9160ace71bb7cbc2729a Mon Sep 17 00:00:00 2001 From: Enis Date: Sun, 26 May 2019 13:59:53 +0100 Subject: [PATCH 5/8] BUG: Ensure that the index values obtained when calling describe on an empty Categorical / Object column is the same as that of an non empty column --- pandas/tests/frame/test_analytics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index adc2b9e697370..487ff7932ec5f 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -588,6 +588,8 @@ def test_describe_categorical(self): result = 
df3.describe() tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) + def test_describe_empty_categorical_column(self): + # GH 26397 # Ensure the index of an an empty categoric DataFrame column # also contains (count, unique, top, freq) df = pd.DataFrame({"empty_col": Categorical([])}) From 65b1cc91c14c2dbe72d1f031664824ccfd9c4806 Mon Sep 17 00:00:00 2001 From: Enis Date: Sun, 26 May 2019 22:04:46 +0100 Subject: [PATCH 6/8] Add entry to api_breaking section for 0.25.0 --- doc/source/whatsnew/v0.25.0.rst | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index dc87ae8f48b8a..1adcd406d5f8d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -184,6 +184,40 @@ are returned. (:issue:`21521`) df.groupby("a").ffill() +``DataFrame`` describe on an empty categorical / object column will return top and freq +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When calling :func:`describe`` on a :class:`DataFrame` with an empty categorical / object +column, the 'top' and 'freq' columns were previously omitted, which was inconsistent with +the output for non-empty columns. Now, the 'top' and 'freq' for the column will be shown +as np.NaN in the case of an empty :class:`DataFrame` (:issue:`26397`) + +.. ipython:: python + + df = pd.DataFrame({"empty_col": pd.Categorical([])}) + +*Previous Behavior*: + +.. code-block:: python + + In [3]: df.describe() + Out[3]: + empty_col + count 0 + unique 0 + +*New Behavior*: + +.. code-block:: python + + In [4]: df.describe() + Out[4]: + empty_col + count 0.0 + unique 0.0 + top NaN + freq NaN + .. 
_whatsnew_0250.api_breaking.deps: From 41a9c36fa0d541b4cf197e2edfbb51824a086617 Mon Sep 17 00:00:00 2001 From: enisnazif Date: Thu, 30 May 2019 13:42:20 +0100 Subject: [PATCH 7/8] Update v0.25.0.rst Addressed review comments --- doc/source/whatsnew/v0.25.0.rst | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1adcd406d5f8d..329e5195136f2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -195,6 +195,7 @@ as np.NaN in the case of an empty :class:`DataFrame` (:issue:`26397`) .. ipython:: python df = pd.DataFrame({"empty_col": pd.Categorical([])}) + df *Previous Behavior*: @@ -208,16 +209,9 @@ as np.NaN in the case of an empty :class:`DataFrame` (:issue:`26397`) *New Behavior*: -.. code-block:: python - - In [4]: df.describe() - Out[4]: - empty_col - count 0.0 - unique 0.0 - top NaN - freq NaN +.. ipython:: python + df.describe() .. _whatsnew_0250.api_breaking.deps: From 55702b28bd5fefae5e41f5975c6a3e92130059fc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 30 May 2019 17:16:14 -0500 Subject: [PATCH 8/8] fixup --- doc/source/whatsnew/v0.25.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 329e5195136f2..8c9256dab1658 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -187,10 +187,10 @@ are returned. (:issue:`21521`) ``DataFrame`` describe on an empty categorical / object column will return top and freq ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When calling :func:`describe`` on a :class:`DataFrame` with an empty categorical / object +When calling :meth:`DataFrame.describe` with an empty categorical / object column, the 'top' and 'freq' columns were previously omitted, which was inconsistent with -the output for non-empty columns. 
Now, the 'top' and 'freq' for the column will be shown -as np.NaN in the case of an empty :class:`DataFrame` (:issue:`26397`) +the output for non-empty columns. Now the 'top' and 'freq' columns will always be included, +with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`) .. ipython:: python