From 20a02c3a9d14889aef0c2415afa4783f65c4734b Mon Sep 17 00:00:00 2001 From: Niklas Osterlund Date: Tue, 27 Oct 2015 15:13:36 +0100 Subject: [PATCH 1/6] Fix for DataFrame.hist() with by- and weights-keyword will make the following work import pandas as pd d = {'one' : ['A', 'A', 'B', 'C'], 'two' : [4., 3., 2., 1.], 'three' : [10., 8., 5., 7.]} df = pd.DataFrame(d) df.hist('two', by='one', weights='three', bins=range(0, 10)) --- pandas/tools/plotting.py | 53 ++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 98d6f5e8eb797..9e95cdbbb386d 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2770,9 +2770,10 @@ def plot_group(group, ax): return fig -def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): +def hist_frame(data, column=None, weights=None, by=None, grid=True, + xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, + sharex=False, sharey=False, figsize=None, layout=None, bins=10, + **kwds): """ Draw histogram of the DataFrame's series using matplotlib / pylab. @@ -2781,6 +2782,8 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, data : DataFrame column : string or sequence If passed, will be used to limit data to a subset of columns + weights : string or sequence + If passed, will be used to weight the data by : object, optional If passed, then used to form histograms for separate groups grid : boolean, default True @@ -2812,7 +2815,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, """ if by is not None: - axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize, + axes = grouped_hist(data, column=column, weights=weights, by=by, ax=ax, grid=grid, figsize=figsize, sharex=sharex, sharey=sharey, layout=layout, bins=bins, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, **kwds) @@ -2916,10 +2919,10 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, return axes -def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, - layout=None, sharex=False, sharey=False, rot=90, grid=True, - xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, - **kwargs): +def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50, + figsize=None, layout=None, sharex=False, sharey=False, rot=90, + grid=True, xlabelsize=None, xrot=None, ylabelsize=None, + yrot=None, **kwargs): """ Grouped histogram @@ -2927,6 +2930,7 @@ def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, ---------- data: Series/DataFrame column: object, optional + weights: object, optional by: object, optional ax: axes, optional bins: int, default 50 @@ -2942,12 +2946,20 @@ def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, ------- axes: collection of Matplotlib Axes """ - def plot_group(group, ax): - ax.hist(group.dropna().values, bins=bins, **kwargs) + def plot_group(group, ax, weights=None): + if weights is not None: + # remove fields where we have nan in weights OR in group + # for both data sets + inx_na = (np.isnan(weights)) | (np.isnan(group)) + weights = weights.ix[~inx_na] + group = group.ix[~inx_na] + else: + group = group.dropna() + ax.hist(group.values, weights=weights.values, bins=bins, **kwargs) xrot = xrot or rot - fig, axes = _grouped_plot(plot_group, data, column=column, + fig, axes = _grouped_plot(plot_group, data, column=column, weights=weights, by=by, sharex=sharex, sharey=sharey, ax=ax, figsize=figsize, layout=layout, rot=rot) @@ -3034,9 +3046,9 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, return ret -def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, - figsize=None, sharex=True, sharey=True, layout=None, - rot=0, ax=None, **kwargs): +def _grouped_plot(plotf, data, column=None, weights=None, by=None, + numeric_only=True, figsize=None, sharex=True, sharey=True, + layout=None, rot=0, ax=None, **kwargs): from pandas import DataFrame if figsize == 'default': @@ -3047,6 +3059,8 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, grouped = data.groupby(by) if column is not None: + if weights is not None: + weights = grouped[weights] grouped = grouped[column] naxes = len(grouped) @@ -3056,11 +3070,20 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, _axes = _flatten(axes) + weight = None for i, (key, group) in enumerate(grouped): ax = _axes[i] + if weights is not None: + weight = weights.get_group(key) if numeric_only and isinstance(group, DataFrame): group = group._get_numeric_data() - plotf(group, ax, **kwargs) + if weight is not None: + weight = weight._get_numeric_data() + if weight is not None: + plotf(group, ax, weight, **kwargs) + else: + # scatterplot etc has not the weight implemented in plotf + plotf(group, ax, **kwargs) ax.set_title(com.pprint_thing(key)) return fig, axes From 5bbba11e14625bd36f8ce1f1bead9add3c8a32a1 Mon Sep 17 00:00:00 2001 From: Niklas Osterlund Date: Tue, 27 Oct 2015 19:17:22 +0100 Subject: [PATCH 2/6] Fix for handling both ndarray and series. Also fix if NaN's make an entire group empty --- pandas/tools/plotting.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 9e95cdbbb386d..e6055e2882a67 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2947,15 +2947,22 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50, axes: collection of Matplotlib Axes """ def plot_group(group, ax, weights=None): + if isinstance(group, np.ndarray) == False: + group = group.values if weights is not None: # remove fields where we have nan in weights OR in group # for both data sets + if isinstance(weights, np.ndarray) == False: + weights = weights.values inx_na = (np.isnan(weights)) | (np.isnan(group)) - weights = weights.ix[~inx_na] - group = group.ix[~inx_na] + weights = weights[~inx_na] + group = group[~inx_na] else: group = group.dropna() - ax.hist(group.values, weights=weights.values, bins=bins, **kwargs) + if len(group) > 0: + # if length is less than 0, we had only NaN's for this group + # nothing to print! + ax.hist(group, weights=weights, bins=bins, **kwargs) xrot = xrot or rot From 787577e9dd8eaba1904ec4f36adf515f6e159ceb Mon Sep 17 00:00:00 2001 From: Niklas Osterlund Date: Tue, 27 Oct 2015 20:26:53 +0100 Subject: [PATCH 3/6] Fix for if weights is supplied by an array instead of the column name in the DataFrame --- pandas/tools/plotting.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index e6055e2882a67..e2925849db823 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2949,16 +2949,15 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50, def plot_group(group, ax, weights=None): if isinstance(group, np.ndarray) == False: group = group.values + inx_na = np.isnan(group) if weights is not None: # remove fields where we have nan in weights OR in group # for both data sets if isinstance(weights, np.ndarray) == False: weights = weights.values - inx_na = (np.isnan(weights)) | (np.isnan(group)) + inx_na |= (np.isnan(weights)) weights = weights[~inx_na] - group = group[~inx_na] - else: - group = group.dropna() + group = group[~inx_na] if len(group) > 0: # if length is less than 0, we had only NaN's for this group # nothing to print! @@ -3064,6 +3063,11 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None, "size by tuple instead", FutureWarning, stacklevel=4) figsize = None + if isinstance(weights, np.ndarray): + # weights supplied as an array instead of a part of the dataframe + data['weights'] = weights + weights = 'weights' + grouped = data.groupby(by) if column is not None: if weights is not None: From b6bcb5c93cb0776ae3205bb3c51db3af7e86c808 Mon Sep 17 00:00:00 2001 From: Niklas Osterlund Date: Tue, 27 Oct 2015 21:39:52 +0100 Subject: [PATCH 4/6] Fixed bug where weights were not used, since not included in **kwds anymore Also made the logic better when using weights without group by by aligning NaN's, and finding out if weights is supplied by column or by array. --- pandas/tools/plotting.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index e2925849db823..19c8605a00fea 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2813,7 +2813,6 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True, kwds : other plotting keyword arguments To be passed to hist function """ - if by is not None: axes = grouped_hist(data, column=column, weights=weights, by=by, ax=ax, grid=grid, figsize=figsize, sharex=sharex, sharey=sharey, layout=layout, bins=bins, @@ -2821,11 +2820,28 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True, **kwds) return axes + inx_na = np.zeros(len(data), dtype=bool) + if weights is not None: + # first figure out if given my column name, or by an array + if isinstance(weights, str): + weights = data[weights] + if isinstance(weights, np.ndarray) == False: + weights = weights.values + # remove fields where we have nan in weights OR in group + # for both data sets + inx_na = (np.isnan(weights)) + if column is not None: if not isinstance(column, (list, np.ndarray, Index)): column = [column] data = data[column] data = data._get_numeric_data() + inx_na |= np.isnan(data.T.values)[0] + + data = data.ix[~inx_na] + if weights is not None: + weights = weights[~inx_na] + naxes = len(data.columns) fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, @@ -2835,7 +2851,7 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True, for i, col in enumerate(com._try_sort(data.columns)): ax = _axes[i] - ax.hist(data[col].dropna().values, bins=bins, **kwds) + ax.hist(data[col].values, bins=bins, weights=weights, **kwds) ax.set_title(col) ax.grid(grid) @@ -3063,10 +3079,12 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None, "size by tuple instead", FutureWarning, stacklevel=4) figsize = None + added_weights_dummy_column = False if isinstance(weights, np.ndarray): # weights supplied as an array instead of a part of the dataframe data['weights'] = weights weights = 'weights' + added_weights_dummy_column = True grouped = data.groupby(by) if column is not None: @@ -3074,6 +3092,9 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None, weights = grouped[weights] grouped = grouped[column] + if added_weights_dummy_column: + data = data.drop('weights', axis=1) + naxes = len(grouped) fig, axes = _subplots(naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, From cbe68ecb97cb68c632f65f0e788e2fa30095a3b1 Mon Sep 17 00:00:00 2001 From: Niklas Osterlund Date: Wed, 28 Oct 2015 14:28:36 +0100 Subject: [PATCH 5/6] changed weights= to end of functions Uses dropna(subset=...) to delete where nan's over the columns supplied Also doing this in the beginning so we do not have to duplicate this logic --- pandas/tools/plotting.py | 84 +++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 48 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 19c8605a00fea..3acf2f39b8864 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2770,9 +2770,9 @@ def plot_group(group, ax): return fig -def hist_frame(data, column=None, weights=None, by=None, grid=True, - xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, - sharex=False, sharey=False, figsize=None, layout=None, bins=10, +def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, + xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, + sharey=False, figsize=None, layout=None, bins=10, weights=None, **kwds): """ Draw histogram of the DataFrame's series using matplotlib / pylab. @@ -2782,8 +2782,6 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True, data : DataFrame column : string or sequence If passed, will be used to limit data to a subset of columns - weights : string or sequence - If passed, will be used to weight the data by : object, optional If passed, then used to form histograms for separate groups grid : boolean, default True @@ -2810,38 +2808,42 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True, layout: (optional) a tuple (rows, columns) for the layout of the histograms bins: integer, default 10 Number of histogram bins to be used + weights : string or sequence + If passed, will be used to weight the data kwds : other plotting keyword arguments To be passed to hist function """ + subset_cols_drop_nan = [] + if weights is not None: + if isinstance(weights, np.ndarray): + # weights supplied as an array instead of a part of the dataframe + if 'weights' in data.columns: + raise NameError('weights already in data.columns. Could not ' + + 'add dummy column') + data = data.copy() + data['weights'] = weights + weights = 'weights' + subset_cols_drop_nan.append(weights) + if column is not None: + subset_cols_drop_nan.append(column) + data = data.dropna(subset=subset_cols_drop_nan) + if by is not None: - axes = grouped_hist(data, column=column, weights=weights, by=by, ax=ax, grid=grid, figsize=figsize, + axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize, sharex=sharex, sharey=sharey, layout=layout, bins=bins, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds) + weights=weights, **kwds) return axes - inx_na = np.zeros(len(data), dtype=bool) if weights is not None: - # first figure out if given my column name, or by an array - if isinstance(weights, str): - weights = data[weights] - if isinstance(weights, np.ndarray) == False: - weights = weights.values - # remove fields where we have nan in weights OR in group - # for both data sets - inx_na = (np.isnan(weights)) + weights = data[weights] + weights = weights._get_numeric_data() if column is not None: if not isinstance(column, (list, np.ndarray, Index)): column = [column] data = data[column] data = data._get_numeric_data() - inx_na |= np.isnan(data.T.values)[0] - - data = data.ix[~inx_na] - if weights is not None: - weights = weights[~inx_na] - naxes = len(data.columns) fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, @@ -2935,10 +2937,10 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, return axes -def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50, +def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, layout=None, sharex=False, sharey=False, rot=90, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, - yrot=None, **kwargs): + yrot=None, weights=None, **kwargs): """ Grouped histogram @@ -2946,7 +2948,6 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50, ---------- data: Series/DataFrame column: object, optional - weights: object, optional by: object, optional ax: axes, optional bins: int, default 50 @@ -2956,6 +2957,7 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50, sharey: boolean, default False rot: int, default 90 grid: bool, default True + weights: object, optional kwargs: dict, keyword arguments passed to matplotlib.Axes.hist Returns @@ -2965,15 +2967,9 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50, def plot_group(group, ax, weights=None): if isinstance(group, np.ndarray) == False: group = group.values - inx_na = np.isnan(group) if weights is not None: - # remove fields where we have nan in weights OR in group - # for both data sets if isinstance(weights, np.ndarray) == False: weights = weights.values - inx_na |= (np.isnan(weights)) - weights = weights[~inx_na] - group = group[~inx_na] if len(group) > 0: # if length is less than 0, we had only NaN's for this group # nothing to print! @@ -2981,9 +2977,10 @@ def plot_group(group, ax, weights=None): xrot = xrot or rot - fig, axes = _grouped_plot(plot_group, data, column=column, weights=weights, - by=by, sharex=sharex, sharey=sharey, ax=ax, - figsize=figsize, layout=layout, rot=rot) + fig, axes = _grouped_plot(plot_group, data, column=column, by=by, + sharex=sharex, sharey=sharey, ax=ax, + figsize=figsize, layout=layout, rot=rot, + weights=weights) _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot) @@ -3068,9 +3065,9 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, return ret -def _grouped_plot(plotf, data, column=None, weights=None, by=None, +def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, figsize=None, sharex=True, sharey=True, - layout=None, rot=0, ax=None, **kwargs): + layout=None, rot=0, ax=None, weights=None, **kwargs): from pandas import DataFrame if figsize == 'default': @@ -3079,22 +3076,13 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None, "size by tuple instead", FutureWarning, stacklevel=4) figsize = None - added_weights_dummy_column = False - if isinstance(weights, np.ndarray): - # weights supplied as an array instead of a part of the dataframe - data['weights'] = weights - weights = 'weights' - added_weights_dummy_column = True - grouped = data.groupby(by) + + if weights is not None: + weights = grouped[weights] if column is not None: - if weights is not None: - weights = grouped[weights] grouped = grouped[column] - if added_weights_dummy_column: - data = data.drop('weights', axis=1) - naxes = len(grouped) fig, axes = _subplots(naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, From 868fbf0523e3d0d92b796034138d81cf053aad0f Mon Sep 17 00:00:00 2001 From: Niklas Osterlund Date: Wed, 28 Oct 2015 16:22:45 +0100 Subject: [PATCH 6/6] Added the first test (in my life) --- pandas/tests/test_graphics_others.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/test_graphics_others.py b/pandas/tests/test_graphics_others.py index b18cbae600b43..54f6cf50ea5ec 100644 --- a/pandas/tests/test_graphics_others.py +++ b/pandas/tests/test_graphics_others.py @@ -302,6 +302,30 @@ def test_boxplot_empty_column(self): df.loc[:, 0] = np.nan _check_plot_works(df.boxplot, return_type='axes') + @slow + def test_hist_df_nan_and_weights(self): + d = {'category' : ['A', 'A', 'B', 'B', 'C'], + 'items' : [4., 3., 2., np.nan, 1], + 'val' : [10., 8., np.nan, 5, 7.]} + df = DataFrame(d) + orig_columns = df.columns + orig_rows = len(df) + _check_plot_works(df.hist, column='items', by='category', + weights='val', bins=range(0, 10)) + _check_plot_works(df.hist, column='items', by='category', + weights=df.val.values, bins=range(0, 10)) + # check without weights functionality + _check_plot_works(df.hist, column='items', by='category', + bins=range(0, 10)) + _check_plot_works(df.hist, column='items', weights='val', + bins=range(0, 10)) + _check_plot_works(df.hist, column='items', weights=df.val.values, + bins=range(0, 10)) + # also check that we have not changed the original df that had + # nan values in it. + self.assertEqual(len(orig_columns), len(df.columns)) + self.assertEqual(orig_rows, len(df)) + @slow def test_hist_df_legacy(self): from matplotlib.patches import Rectangle