From 367b648e1b2f7dcc639b913799063657b1cffe8d Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Thu, 25 Sep 2014 08:26:35 +1000
Subject: [PATCH 1/8] Allow size scaling by passing column name

---
 pandas/tools/plotting.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index cb669b75e5c96..d493c093f6d5c 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -1374,7 +1374,8 @@ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True):
 class ScatterPlot(MPLPlot):
     _layout_type = 'single'
 
-    def __init__(self, data, x, y, c=None, **kwargs):
+    def __init__(self, data, x, y, c=None, s=None,
+                 size_range=(50, 1000), **kwargs):
         MPLPlot.__init__(self, data, **kwargs)
         if x is None or y is None:
             raise ValueError( 'scatter requires and x and y column')
@@ -1387,6 +1388,8 @@ def __init__(self, data, x, y, c=None, **kwargs):
         self.x = x
         self.y = y
         self.c = c
+        self.s = s
+        self.size_range = size_range
 
     @property
     def nseries(self):
@@ -1398,7 +1401,7 @@ def _make_plot(self):
 
         import matplotlib.pyplot as plt
 
-        x, y, c, data = self.x, self.y, self.c, self.data
+        x, y, c, s, data = self.x, self.y, self.c, self.s, self.data
         ax = self.axes[0]
 
         # plot a colorbar only if a colormap is provided or necessary
@@ -1415,12 +1418,20 @@ def _make_plot(self):
         else:
             c_values = c
 
+        # Set up size scaling if necessary
+        if s is None:
+            s_values = self.plt.rcParams['lines.markersize']
+        elif s in self.data.columns:
+            s_values = self._convert_size_vals_to_points(self.data[s].values)
+        else:
+            s_values = s
+
         if self.legend and hasattr(self, 'label'):
             label = self.label
         else:
             label = None
         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
-                             label=label, cmap=cmap, **self.kwds)
+                             s=s_values, label=label, cmap=cmap, **self.kwds)
         if cb:
             img = ax.collections[0]
             kws = dict(ax=ax)
@@ -1437,6 +1448,13 @@ def _make_plot(self):
             err_kwds['ecolor'] = scatter.get_facecolor()[0]
             ax.errorbar(data[x].values, data[y].values, linestyle='none', **err_kwds)
 
+    def _convert_size_vals_to_points(self, vals):
+        min_size, max_size = self.size_range
+        val_range = vals.max() - vals.min()
+        normalized_vals = (vals - vals.min()) / val_range
+        point_sizes = (min_size + (normalized_vals * (max_size - min_size)))
+        return point_sizes
+
     def _post_plot_logic(self):
         ax = self.axes[0]
         x, y = self.x, self.y

From aab72ba7829ccd76023909e607a26468982ea0da Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Fri, 14 Nov 2014 19:30:01 +1100
Subject: [PATCH 2/8] Allow categorical size column

---
 pandas/tools/plotting.py | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index d493c093f6d5c..19cea86fb541d 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -1388,9 +1388,17 @@ def __init__(self, data, x, y, c=None, s=None,
         self.x = x
         self.y = y
         self.c = c
-        self.s = s
         self.size_range = size_range
 
+        # Set up size scaling if necessary; this must happen before plot
+        # generation starts and non-numeric data thrown away
+        if s is None:
+            self.s_values = self.plt.rcParams['lines.markersize']
+        elif s in self.data.columns:
+            self.s_values = self._convert_column_to_size(s)
+        else:
+            self.s_values = s
+
     @property
     def nseries(self):
         return 1
@@ -1401,7 +1409,7 @@ def _make_plot(self):
 
         import matplotlib.pyplot as plt
 
-        x, y, c, s, data = self.x, self.y, self.c, self.s, self.data
+        x, y, c, data = self.x, self.y, self.c, self.data
         ax = self.axes[0]
 
         # plot a colorbar only if a colormap is provided or necessary
@@ -1418,20 +1426,14 @@ def _make_plot(self):
         else:
             c_values = c
 
-        # Set up size scaling if necessary
-        if s is None:
-            s_values = self.plt.rcParams['lines.markersize']
-        elif s in self.data.columns:
-            s_values = self._convert_size_vals_to_points(self.data[s].values)
-        else:
-            s_values = s
 
         if self.legend and hasattr(self, 'label'):
             label = self.label
         else:
             label = None
         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
-                             s=s_values, label=label, cmap=cmap, **self.kwds)
+                             s=self.s_values, label=label, cmap=cmap,
+                             **self.kwds)
         if cb:
             img = ax.collections[0]
             kws = dict(ax=ax)
@@ -1448,11 +1450,21 @@ def _make_plot(self):
             err_kwds['ecolor'] = scatter.get_facecolor()[0]
             ax.errorbar(data[x].values, data[y].values, linestyle='none', **err_kwds)
 
-    def _convert_size_vals_to_points(self, vals):
+    def _convert_column_to_size(self, col_name):
         min_size, max_size = self.size_range
-        val_range = vals.max() - vals.min()
-        normalized_vals = (vals - vals.min()) / val_range
-        point_sizes = (min_size + (normalized_vals * (max_size - min_size)))
+        size_col = self.data[col_name]
+
+        if com.is_categorical_dtype(size_col):
+            n_categories = len(size_col.cat.categories)
+            cat_sizes = np.linspace(min_size, max_size, num=n_categories)
+            size_mapper = Series(cat_sizes, index=size_col.cat.categories)
+            point_sizes = size_col.map(size_mapper)
+        else:
+            vals = self.data[col_name].values
+            val_range = vals.max() - vals.min()
+            normalized_vals = (vals - vals.min()) / val_range
+            point_sizes = (min_size + (normalized_vals * (max_size - min_size)))
+
         return point_sizes
 
     def _post_plot_logic(self):

From cc2415a2c80220d793b2d25e411ccf7526470f63 Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Mon, 24 Nov 2014 21:00:41 +1100
Subject: [PATCH 3/8] Add to release notes

---
 doc/source/whatsnew/v0.15.2.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
index 6688f106f922e..6ecba9e0e0ad8 100644
--- a/doc/source/whatsnew/v0.15.2.txt
+++ b/doc/source/whatsnew/v0.15.2.txt
@@ -66,6 +66,7 @@ Enhancements
 - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
 - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here<remote_data.ga>`.
 - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`).  See :ref:`here <io.stata-categorical>` for more information on importing categorical variables from Stata data files.
+- Added support for passing a column name as the size argument to ``DataFrame.plot(kind='scatter')``, along with a ``size_range`` argument to control scaling (:issue:`8244`).
 
 .. _whatsnew_0152.performance:
 

From af83de33d2678926fc985bc1201ca801e72b5a15 Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Mon, 24 Nov 2014 21:01:49 +1100
Subject: [PATCH 4/8] Add sizing examples to docs

---
 doc/source/visualization.rst | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
index f30d6c9d5d4c0..441bdf7b1f16e 100644
--- a/doc/source/visualization.rst
+++ b/doc/source/visualization.rst
@@ -8,7 +8,7 @@
    import pandas as pd
    from numpy.random import randn, rand, randint
    np.random.seed(123456)
-   from pandas import DataFrame, Series, date_range, options
+   from pandas import DataFrame, Series, date_range, options, Categorical
    import pandas.util.testing as tm
    np.set_printoptions(precision=4, suppress=True)
    import matplotlib.pyplot as plt
@@ -587,19 +587,40 @@ each point:
 
    plt.close('all')
 
-You can pass other keywords supported by matplotlib ``scatter``.
-Below example shows a bubble chart using a dataframe column values as bubble size.
+You can also pass a column name as the ``s`` (size) argument to have
+the point sizes scale according to that column's values. The minimum and
+maximum sizes of the bubbles (in points) are controlled by the
+``size_range`` argument, with a default range of ``(50, 1000)``. The
+below example shows a bubble chart using a dataframe column values
+as bubble size.
 
 .. ipython:: python
 
-   @savefig scatter_plot_bubble.png
-   df.plot(kind='scatter', x='a', y='b', s=df['c']*200);
+   @savefig scatter_plot_sizes.png
+   df.plot(kind='scatter', x='a', y='b', s='c');
 
 .. ipython:: python
    :suppress:
 
    plt.close('all')
 
+Categorical columns can also be used to set point sizes, producing
+a set of equally spaced point sizes:
+
+.. ipython:: python
+
+   df['group'] = Categorical(randint(1, 4, 50))
+   @savefig scatter_plot_categorical_sizes.png
+   df.plot(kind='scatter', x='a', y='b', s='group');
+
+.. ipython:: python
+   :suppress:
+
+   plt.close('all')
+
+You can pass other keywords supported by matplotlib ``scatter``, e.g. ``alpha``
+to control the transparency of points.
+
 See the :meth:`scatter <matplotlib.axes.Axes.scatter>` method and the
 `matplotlib scatter documenation <http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.scatter>`__ for more.
 

From 9a385710cfaea4efed9da20c24856d95dd55eddb Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Mon, 24 Nov 2014 21:09:21 +1100
Subject: [PATCH 5/8] Only use s as column name if string

---
 pandas/tools/plotting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 19cea86fb541d..75d8af651dcb8 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -1394,7 +1394,7 @@ def __init__(self, data, x, y, c=None, s=None,
         # generation starts and non-numeric data thrown away
         if s is None:
             self.s_values = self.plt.rcParams['lines.markersize']
-        elif s in self.data.columns:
+        elif isinstance(s, str) and s in self.data.columns:
             self.s_values = self._convert_column_to_size(s)
         else:
             self.s_values = s

From 69ccae10d2a82a9963d7a814f4b71703595d6cc1 Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Mon, 24 Nov 2014 21:09:46 +1100
Subject: [PATCH 6/8] Docs: only string column names supported

---
 doc/source/visualization.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
index 441bdf7b1f16e..24f4e1125a59a 100644
--- a/doc/source/visualization.rst
+++ b/doc/source/visualization.rst
@@ -588,7 +588,10 @@ each point:
    plt.close('all')
 
 You can also pass a column name as the ``s`` (size) argument to have
-the point sizes scale according to that column's values. The minimum and
+the point sizes scale according to that column's values. Currently
+this is only supported for string column names.
+
+The minimum and
 maximum sizes of the bubbles (in points) are controlled by the
 ``size_range`` argument, with a default range of ``(50, 1000)``. The
 below example shows a bubble chart using a dataframe column values

From a103bbaca67d1dc98ef8e13f740eee530ac896dd Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Mon, 24 Nov 2014 21:19:33 +1100
Subject: [PATCH 7/8] Add tests

---
 pandas/tests/test_graphics.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py
index 74ec6d22ca4cd..0cf886aa58b76 100644
--- a/pandas/tests/test_graphics.py
+++ b/pandas/tests/test_graphics.py
@@ -9,7 +9,8 @@
 
 from datetime import datetime, date
 
-from pandas import Series, DataFrame, MultiIndex, PeriodIndex, date_range
+from pandas import (Series, DataFrame, MultiIndex, PeriodIndex, date_range,
+                    Categorical)
 from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip,
                            iteritems, OrderedDict)
 from pandas.util.decorators import cache_readonly
@@ -1645,6 +1646,32 @@ def test_plot_scatter_with_c(self):
         self.assertIs(ax.collections[0].colorbar, None)
         self._check_colors(ax.collections, facecolors=['r'])
 
+    @slow
+    def test_plot_scatter_with_size(self):
+        df = DataFrame(randn(6, 3),
+           index=list(string.ascii_letters[:6]),
+           columns=['x', 'y', 'z'])
+        df['group'] = Categorical(random.randint(1, 4, 6))
+
+        size_range = (100, 500)
+        ax1 = df.plot(kind='scatter', x='x', y='y', s='z',
+                      size_range=size_range)
+        point_sizes1 = ax1.collections[0]._sizes
+        self.assertGreaterEqual(min(point_sizes1), size_range[0])
+        self.assertLessEqual(max(point_sizes1), size_range[1])
+
+        # Categorical size column
+        ax2 = df.plot(kind='scatter', x='x', y='y', s='group',
+                      size_range=size_range)
+        point_sizes2 = ax2.collections[0]._sizes
+        self.assertGreaterEqual(min(point_sizes2), size_range[0])
+        self.assertLessEqual(max(point_sizes2), size_range[1])
+        unique_sizes = np.unique(point_sizes2)
+        self.assertEqual(
+            len(unique_sizes),
+            len(df['group'].cat.categories)
+        )
+
     @slow
     def test_plot_bar(self):
         df = DataFrame(randn(6, 4),

From a404ad88159603f1e1d39fa0d6272d8bacad2014 Mon Sep 17 00:00:00 2001
From: onesandzeroes <onesandzeroes@github.com>
Date: Mon, 24 Nov 2014 21:25:12 +1100
Subject: [PATCH 8/8] Combine paragraphs at end of scatter docs

---
 doc/source/visualization.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
index 24f4e1125a59a..42fa3da67ff4b 100644
--- a/doc/source/visualization.rst
+++ b/doc/source/visualization.rst
@@ -622,9 +622,8 @@ a set of equally spaced point sizes:
    plt.close('all')
 
 You can pass other keywords supported by matplotlib ``scatter``, e.g. ``alpha``
-to control the transparency of points.
-
-See the :meth:`scatter <matplotlib.axes.Axes.scatter>` method and the
+to control the transparency of points. See the
+:meth:`scatter <matplotlib.axes.Axes.scatter>` method and the
 `matplotlib scatter documenation <http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.scatter>`__ for more.
 
 .. _visualization.hexbin: