From 20a02c3a9d14889aef0c2415afa4783f65c4734b Mon Sep 17 00:00:00 2001
From: Niklas Osterlund <niklas.osterlund@gmail.com>
Date: Tue, 27 Oct 2015 15:13:36 +0100
Subject: [PATCH 1/6] Fix for DataFrame.hist() with by- and weights-keyword

This makes the following example work:

import pandas as pd
d = {'one' : ['A', 'A', 'B', 'C'],
     'two' : [4., 3., 2., 1.],
     'three' : [10., 8., 5., 7.]}
df = pd.DataFrame(d)
df.hist('two', by='one', weights='three', bins=range(0, 10))
---
 pandas/tools/plotting.py | 53 ++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 98d6f5e8eb797..9e95cdbbb386d 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -2770,9 +2770,10 @@ def plot_group(group, ax):
     return fig
 
 
-def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
-               xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
-               sharey=False, figsize=None, layout=None, bins=10, **kwds):
+def hist_frame(data, column=None, weights=None, by=None, grid=True,
+               xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None,
+               sharex=False, sharey=False, figsize=None, layout=None, bins=10,
+               **kwds):
     """
     Draw histogram of the DataFrame's series using matplotlib / pylab.
 
@@ -2781,6 +2782,8 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
     data : DataFrame
     column : string or sequence
         If passed, will be used to limit data to a subset of columns
+    weights : string or sequence
+        If passed, will be used to weight the data
     by : object, optional
         If passed, then used to form histograms for separate groups
     grid : boolean, default True
@@ -2812,7 +2815,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
     """
 
     if by is not None:
-        axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize,
+        axes = grouped_hist(data, column=column, weights=weights, by=by, ax=ax, grid=grid, figsize=figsize,
                             sharex=sharex, sharey=sharey, layout=layout, bins=bins,
                             xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot,
                             **kwds)
@@ -2916,10 +2919,10 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None,
     return axes
 
 
-def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None,
-                 layout=None, sharex=False, sharey=False, rot=90, grid=True,
-                 xlabelsize=None, xrot=None, ylabelsize=None, yrot=None,
-                 **kwargs):
+def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
+                 figsize=None, layout=None, sharex=False, sharey=False, rot=90,
+                 grid=True, xlabelsize=None, xrot=None, ylabelsize=None,
+                 yrot=None, **kwargs):
     """
     Grouped histogram
 
@@ -2927,6 +2930,7 @@ def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None,
     ----------
     data: Series/DataFrame
     column: object, optional
+    weights: object, optional
     by: object, optional
     ax: axes, optional
     bins: int, default 50
@@ -2942,12 +2946,20 @@ def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None,
     -------
     axes: collection of Matplotlib Axes
     """
-    def plot_group(group, ax):
-        ax.hist(group.dropna().values, bins=bins, **kwargs)
+    def plot_group(group, ax, weights=None):
+        if weights is not None:
+            # remove fields where we have nan in weights OR in group
+            # for both data sets
+            inx_na = (np.isnan(weights)) | (np.isnan(group))
+            weights = weights.ix[~inx_na]
+            group = group.ix[~inx_na]
+        else:
+            group = group.dropna()
+        ax.hist(group.values, weights=weights.values, bins=bins, **kwargs)
 
     xrot = xrot or rot
 
-    fig, axes = _grouped_plot(plot_group, data, column=column,
+    fig, axes = _grouped_plot(plot_group, data, column=column, weights=weights,
                               by=by, sharex=sharex, sharey=sharey, ax=ax,
                               figsize=figsize, layout=layout, rot=rot)
 
@@ -3034,9 +3046,9 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None,
     return ret
 
 
-def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True,
-                  figsize=None, sharex=True, sharey=True, layout=None,
-                  rot=0, ax=None, **kwargs):
+def _grouped_plot(plotf, data, column=None, weights=None, by=None,
+                  numeric_only=True, figsize=None, sharex=True, sharey=True,
+                  layout=None, rot=0, ax=None, **kwargs):
     from pandas import DataFrame
 
     if figsize == 'default':
@@ -3047,6 +3059,8 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True,
 
     grouped = data.groupby(by)
     if column is not None:
+        if weights is not None:
+            weights = grouped[weights]
         grouped = grouped[column]
 
     naxes = len(grouped)
@@ -3056,11 +3070,20 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True,
 
     _axes = _flatten(axes)
 
+    weight = None
     for i, (key, group) in enumerate(grouped):
         ax = _axes[i]
+        if weights is not None:
+            weight = weights.get_group(key)
         if numeric_only and isinstance(group, DataFrame):
             group = group._get_numeric_data()
-        plotf(group, ax, **kwargs)
+            if weight is not None:
+                weight = weight._get_numeric_data()
+        if weight is not None:
+            plotf(group, ax, weight, **kwargs)
+        else:
+            # scatterplot etc has not the weight implemented in plotf
+            plotf(group, ax, **kwargs)
         ax.set_title(com.pprint_thing(key))
 
     return fig, axes

From 5bbba11e14625bd36f8ce1f1bead9add3c8a32a1 Mon Sep 17 00:00:00 2001
From: Niklas Osterlund <niklas.osterlund@gmail.com>
Date: Tue, 27 Oct 2015 19:17:22 +0100
Subject: [PATCH 2/6] Fix for handling both ndarray and series.

Also handle the case where NaNs leave an entire group empty.
---
 pandas/tools/plotting.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 9e95cdbbb386d..e6055e2882a67 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -2947,15 +2947,22 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
     axes: collection of Matplotlib Axes
     """
     def plot_group(group, ax, weights=None):
+        if isinstance(group, np.ndarray) == False:
+            group = group.values
         if weights is not None:
             # remove fields where we have nan in weights OR in group
             # for both data sets
+            if isinstance(weights, np.ndarray) == False:
+                weights = weights.values
             inx_na = (np.isnan(weights)) | (np.isnan(group))
-            weights = weights.ix[~inx_na]
-            group = group.ix[~inx_na]
+            weights = weights[~inx_na]
+            group = group[~inx_na]
         else:
             group = group.dropna()
-        ax.hist(group.values, weights=weights.values, bins=bins, **kwargs)
+        if len(group) > 0:
+            # if length is less than 0, we had only NaN's for this group
+            # nothing to print!
+            ax.hist(group, weights=weights, bins=bins, **kwargs)
 
     xrot = xrot or rot
 

From 787577e9dd8eaba1904ec4f36adf515f6e159ceb Mon Sep 17 00:00:00 2001
From: Niklas Osterlund <niklas.osterlund@gmail.com>
Date: Tue, 27 Oct 2015 20:26:53 +0100
Subject: [PATCH 3/6] Fix for weights supplied as an array instead of a
 column name in the DataFrame

---
 pandas/tools/plotting.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index e6055e2882a67..e2925849db823 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -2949,16 +2949,15 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
     def plot_group(group, ax, weights=None):
         if isinstance(group, np.ndarray) == False:
             group = group.values
+        inx_na = np.isnan(group)
         if weights is not None:
             # remove fields where we have nan in weights OR in group
             # for both data sets
             if isinstance(weights, np.ndarray) == False:
                 weights = weights.values
-            inx_na = (np.isnan(weights)) | (np.isnan(group))
+            inx_na |= (np.isnan(weights))
             weights = weights[~inx_na]
-            group = group[~inx_na]
-        else:
-            group = group.dropna()
+        group = group[~inx_na]
         if len(group) > 0:
             # if length is less than 0, we had only NaN's for this group
             # nothing to print!
@@ -3064,6 +3063,11 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None,
                       "size by tuple instead", FutureWarning, stacklevel=4)
         figsize = None
 
+    if isinstance(weights, np.ndarray):
+        # weights supplied as an array instead of a part of the dataframe
+        data['weights'] = weights
+        weights = 'weights'
+        
     grouped = data.groupby(by)
     if column is not None:
         if weights is not None:

From b6bcb5c93cb0776ae3205bb3c51db3af7e86c808 Mon Sep 17 00:00:00 2001
From: Niklas Osterlund <niklas.osterlund@gmail.com>
Date: Tue, 27 Oct 2015 21:39:52 +0100
Subject: [PATCH 4/6] Fixed bug where weights were not used, since not included
 in **kwds anymore

Also improved the logic for using weights without `by`: NaNs are now aligned between the data and the weights, and the code detects whether weights is supplied as a column name or as an array.
---
 pandas/tools/plotting.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index e2925849db823..19c8605a00fea 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -2813,7 +2813,6 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True,
     kwds : other plotting keyword arguments
         To be passed to hist function
     """
-
     if by is not None:
         axes = grouped_hist(data, column=column, weights=weights, by=by, ax=ax, grid=grid, figsize=figsize,
                             sharex=sharex, sharey=sharey, layout=layout, bins=bins,
@@ -2821,11 +2820,28 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True,
                             **kwds)
         return axes
 
+    inx_na = np.zeros(len(data), dtype=bool)
+    if weights is not None:
+        # first figure out if given my column name, or by an array
+        if isinstance(weights, str):
+            weights = data[weights]
+        if isinstance(weights, np.ndarray) == False:
+            weights = weights.values
+        # remove fields where we have nan in weights OR in group
+        # for both data sets
+        inx_na = (np.isnan(weights))
+
     if column is not None:
         if not isinstance(column, (list, np.ndarray, Index)):
             column = [column]
         data = data[column]
     data = data._get_numeric_data()
+    inx_na |= np.isnan(data.T.values)[0]
+
+    data = data.ix[~inx_na]
+    if weights is not None:
+        weights = weights[~inx_na]
+
     naxes = len(data.columns)
 
     fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
@@ -2835,7 +2851,7 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True,
 
     for i, col in enumerate(com._try_sort(data.columns)):
         ax = _axes[i]
-        ax.hist(data[col].dropna().values, bins=bins, **kwds)
+        ax.hist(data[col].values, bins=bins, weights=weights, **kwds)
         ax.set_title(col)
         ax.grid(grid)
 
@@ -3063,10 +3079,12 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None,
                       "size by tuple instead", FutureWarning, stacklevel=4)
         figsize = None
 
+    added_weights_dummy_column = False
     if isinstance(weights, np.ndarray):
         # weights supplied as an array instead of a part of the dataframe
         data['weights'] = weights
         weights = 'weights'
+        added_weights_dummy_column = True
         
     grouped = data.groupby(by)
     if column is not None:
@@ -3074,6 +3092,9 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None,
             weights = grouped[weights]
         grouped = grouped[column]
 
+    if added_weights_dummy_column:
+        data = data.drop('weights', axis=1)
+
     naxes = len(grouped)
     fig, axes = _subplots(naxes=naxes, figsize=figsize,
                           sharex=sharex, sharey=sharey, ax=ax,

From cbe68ecb97cb68c632f65f0e788e2fa30095a3b1 Mon Sep 17 00:00:00 2001
From: Niklas Osterlund <niklas.osterlund@gmail.com>
Date: Wed, 28 Oct 2015 14:28:36 +0100
Subject: [PATCH 5/6] Move weights= to the end of the function signatures

Use dropna(subset=...) to drop rows that have NaNs in the supplied columns.
This is done at the beginning so that the logic does not have to be duplicated.
---
 pandas/tools/plotting.py | 84 +++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 48 deletions(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 19c8605a00fea..3acf2f39b8864 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -2770,9 +2770,9 @@ def plot_group(group, ax):
     return fig
 
 
-def hist_frame(data, column=None, weights=None, by=None, grid=True,
-               xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None,
-               sharex=False, sharey=False, figsize=None, layout=None, bins=10,
+def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
+               xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
+               sharey=False, figsize=None, layout=None, bins=10, weights=None,
                **kwds):
     """
     Draw histogram of the DataFrame's series using matplotlib / pylab.
@@ -2782,8 +2782,6 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True,
     data : DataFrame
     column : string or sequence
         If passed, will be used to limit data to a subset of columns
-    weights : string or sequence
-        If passed, will be used to weight the data
     by : object, optional
         If passed, then used to form histograms for separate groups
     grid : boolean, default True
@@ -2810,38 +2808,42 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True,
     layout: (optional) a tuple (rows, columns) for the layout of the histograms
     bins: integer, default 10
         Number of histogram bins to be used
+    weights : string or sequence
+        If passed, will be used to weight the data
     kwds : other plotting keyword arguments
         To be passed to hist function
     """
+    subset_cols_drop_nan = []
+    if weights is not None:
+        if isinstance(weights, np.ndarray):
+            # weights supplied as an array instead of a part of the dataframe
+            if 'weights' in data.columns:
+                raise NameError('weights already in data.columns. Could not ' +
+                                'add dummy column')
+            data = data.copy()
+            data['weights'] = weights
+            weights = 'weights'
+        subset_cols_drop_nan.append(weights)
+    if column is not None:
+        subset_cols_drop_nan.append(column)
+    data = data.dropna(subset=subset_cols_drop_nan)
+
     if by is not None:
-        axes = grouped_hist(data, column=column, weights=weights, by=by, ax=ax, grid=grid, figsize=figsize,
+        axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize,
                             sharex=sharex, sharey=sharey, layout=layout, bins=bins,
                             xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot,
-                            **kwds)
+                            weights=weights, **kwds)
         return axes
 
-    inx_na = np.zeros(len(data), dtype=bool)
     if weights is not None:
-        # first figure out if given my column name, or by an array
-        if isinstance(weights, str):
-            weights = data[weights]
-        if isinstance(weights, np.ndarray) == False:
-            weights = weights.values
-        # remove fields where we have nan in weights OR in group
-        # for both data sets
-        inx_na = (np.isnan(weights))
+        weights = data[weights]
+        weights = weights._get_numeric_data()
 
     if column is not None:
         if not isinstance(column, (list, np.ndarray, Index)):
             column = [column]
         data = data[column]
     data = data._get_numeric_data()
-    inx_na |= np.isnan(data.T.values)[0]
-
-    data = data.ix[~inx_na]
-    if weights is not None:
-        weights = weights[~inx_na]
-
     naxes = len(data.columns)
 
     fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
@@ -2935,10 +2937,10 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None,
     return axes
 
 
-def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
+def grouped_hist(data, column=None, by=None, ax=None, bins=50,
                  figsize=None, layout=None, sharex=False, sharey=False, rot=90,
                  grid=True, xlabelsize=None, xrot=None, ylabelsize=None,
-                 yrot=None, **kwargs):
+                 yrot=None, weights=None, **kwargs):
     """
     Grouped histogram
 
@@ -2946,7 +2948,6 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
     ----------
     data: Series/DataFrame
     column: object, optional
-    weights: object, optional
     by: object, optional
     ax: axes, optional
     bins: int, default 50
@@ -2956,6 +2957,7 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
     sharey: boolean, default False
     rot: int, default 90
     grid: bool, default True
+    weights: object, optional
     kwargs: dict, keyword arguments passed to matplotlib.Axes.hist
 
     Returns
@@ -2965,15 +2967,9 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
     def plot_group(group, ax, weights=None):
         if isinstance(group, np.ndarray) == False:
             group = group.values
-        inx_na = np.isnan(group)
         if weights is not None:
-            # remove fields where we have nan in weights OR in group
-            # for both data sets
             if isinstance(weights, np.ndarray) == False:
                 weights = weights.values
-            inx_na |= (np.isnan(weights))
-            weights = weights[~inx_na]
-        group = group[~inx_na]
         if len(group) > 0:
             # if length is less than 0, we had only NaN's for this group
             # nothing to print!
@@ -2981,9 +2977,10 @@ def plot_group(group, ax, weights=None):
 
     xrot = xrot or rot
 
-    fig, axes = _grouped_plot(plot_group, data, column=column, weights=weights,
-                              by=by, sharex=sharex, sharey=sharey, ax=ax,
-                              figsize=figsize, layout=layout, rot=rot)
+    fig, axes = _grouped_plot(plot_group, data, column=column, by=by,
+                              sharex=sharex, sharey=sharey, ax=ax,
+                              figsize=figsize, layout=layout, rot=rot,
+                              weights=weights)
 
     _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
              ylabelsize=ylabelsize, yrot=yrot)
@@ -3068,9 +3065,9 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None,
     return ret
 
 
-def _grouped_plot(plotf, data, column=None, weights=None, by=None,
+def _grouped_plot(plotf, data, column=None, by=None,
                   numeric_only=True, figsize=None, sharex=True, sharey=True,
-                  layout=None, rot=0, ax=None, **kwargs):
+                  layout=None, rot=0, ax=None, weights=None, **kwargs):
     from pandas import DataFrame
 
     if figsize == 'default':
@@ -3079,22 +3076,13 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None,
                       "size by tuple instead", FutureWarning, stacklevel=4)
         figsize = None
 
-    added_weights_dummy_column = False
-    if isinstance(weights, np.ndarray):
-        # weights supplied as an array instead of a part of the dataframe
-        data['weights'] = weights
-        weights = 'weights'
-        added_weights_dummy_column = True
-        
     grouped = data.groupby(by)
+
+    if weights is not None:
+        weights = grouped[weights]
     if column is not None:
-        if weights is not None:
-            weights = grouped[weights]
         grouped = grouped[column]
 
-    if added_weights_dummy_column:
-        data = data.drop('weights', axis=1)
-
     naxes = len(grouped)
     fig, axes = _subplots(naxes=naxes, figsize=figsize,
                           sharex=sharex, sharey=sharey, ax=ax,

From 868fbf0523e3d0d92b796034138d81cf053aad0f Mon Sep 17 00:00:00 2001
From: Niklas Osterlund <niklas.osterlund@gmail.com>
Date: Wed, 28 Oct 2015 16:22:45 +0100
Subject: [PATCH 6/6] Added the first test (in my life)

---
 pandas/tests/test_graphics_others.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/pandas/tests/test_graphics_others.py b/pandas/tests/test_graphics_others.py
index b18cbae600b43..54f6cf50ea5ec 100644
--- a/pandas/tests/test_graphics_others.py
+++ b/pandas/tests/test_graphics_others.py
@@ -302,6 +302,30 @@ def test_boxplot_empty_column(self):
         df.loc[:, 0] = np.nan
         _check_plot_works(df.boxplot, return_type='axes')
 
+    @slow
+    def test_hist_df_nan_and_weights(self):
+        d = {'category' : ['A', 'A', 'B', 'B', 'C'],
+             'items' : [4., 3., 2., np.nan, 1],
+             'val' : [10., 8., np.nan, 5, 7.]}
+        df = DataFrame(d)
+        orig_columns = df.columns
+        orig_rows = len(df)
+        _check_plot_works(df.hist, column='items', by='category',
+                          weights='val', bins=range(0, 10))
+        _check_plot_works(df.hist, column='items', by='category',
+                          weights=df.val.values, bins=range(0, 10))
+        # check without weights functionality
+        _check_plot_works(df.hist, column='items', by='category',
+                          bins=range(0, 10))
+        _check_plot_works(df.hist, column='items', weights='val',
+                          bins=range(0, 10))
+        _check_plot_works(df.hist, column='items', weights=df.val.values,
+                          bins=range(0, 10))
+        # also check that we have not changed the original df that had
+        # nan values in it.
+        self.assertEqual(len(orig_columns), len(df.columns))
+        self.assertEqual(orig_rows, len(df))
+
     @slow
     def test_hist_df_legacy(self):
         from matplotlib.patches import Rectangle