From 367b648e1b2f7dcc639b913799063657b1cffe8d Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Thu, 25 Sep 2014 08:26:35 +1000 Subject: [PATCH 1/8] Allow size scaling by passing column name --- pandas/tools/plotting.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index cb669b75e5c96..d493c093f6d5c 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1374,7 +1374,8 @@ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): class ScatterPlot(MPLPlot): _layout_type = 'single' - def __init__(self, data, x, y, c=None, **kwargs): + def __init__(self, data, x, y, c=None, s=None, + size_range=(50, 1000), **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: raise ValueError( 'scatter requires and x and y column') @@ -1387,6 +1388,8 @@ def __init__(self, data, x, y, c=None, **kwargs): self.x = x self.y = y self.c = c + self.s = s + self.size_range = size_range @property def nseries(self): @@ -1398,7 +1401,7 @@ def _make_plot(self): import matplotlib.pyplot as plt - x, y, c, data = self.x, self.y, self.c, self.data + x, y, c, s, data = self.x, self.y, self.c, self.s, self.data ax = self.axes[0] # plot a colorbar only if a colormap is provided or necessary @@ -1415,12 +1418,20 @@ def _make_plot(self): else: c_values = c + # Set up size scaling if necessary + if s is None: + s_values = self.plt.rcParams['lines.markersize'] + elif s in self.data.columns: + s_values = self._convert_size_vals_to_points(self.data[s].values) + else: + s_values = s + if self.legend and hasattr(self, 'label'): label = self.label else: label = None scatter = ax.scatter(data[x].values, data[y].values, c=c_values, - label=label, cmap=cmap, **self.kwds) + s=s_values, label=label, cmap=cmap, **self.kwds) if cb: img = ax.collections[0] kws = dict(ax=ax) @@ -1437,6 +1448,13 @@ def _make_plot(self): err_kwds['ecolor'] = scatter.get_facecolor()[0] ax.errorbar(data[x].values, data[y].values, linestyle='none', **err_kwds) + def _convert_size_vals_to_points(self, vals): + min_size, max_size = self.size_range + val_range = vals.max() - vals.min() + normalized_vals = (vals - vals.min()) / val_range + point_sizes = (min_size + (normalized_vals * (max_size - min_size))) + return point_sizes + def _post_plot_logic(self): ax = self.axes[0] x, y = self.x, self.y From aab72ba7829ccd76023909e607a26468982ea0da Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Fri, 14 Nov 2014 19:30:01 +1100 Subject: [PATCH 2/8] Allow categorical size column --- pandas/tools/plotting.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index d493c093f6d5c..19cea86fb541d 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1388,9 +1388,17 @@ def __init__(self, data, x, y, c=None, s=None, self.x = x self.y = y self.c = c - self.s = s self.size_range = size_range + # Set up size scaling if necessary, need to do this before plot + # generation starts and non-numeric data thrown away + if s is None: + self.s_values = self.plt.rcParams['lines.markersize'] + elif s in self.data.columns: + self.s_values = self._convert_column_to_size(s) + else: + self.s_values = s + @property def nseries(self): return 1 @@ -1401,7 +1409,7 @@ def _make_plot(self): import matplotlib.pyplot as plt - x, y, c, s, data = self.x, self.y, self.c, self.s, self.data + x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] # plot a colorbar only if a colormap is provided or necessary @@ -1418,20 +1426,14 @@ def _make_plot(self): else: c_values = c - # Set up size scaling if necessary - if s is None: - s_values = self.plt.rcParams['lines.markersize'] - elif s in self.data.columns: - s_values = self._convert_size_vals_to_points(self.data[s].values) - else: - s_values = s if self.legend and hasattr(self, 'label'): label = self.label else: label = None scatter = ax.scatter(data[x].values, data[y].values, c=c_values, - s=s_values, label=label, cmap=cmap, **self.kwds) + s=self.s_values, label=label, cmap=cmap, + **self.kwds) if cb: img = ax.collections[0] kws = dict(ax=ax) @@ -1448,11 +1450,21 @@ def _make_plot(self): err_kwds['ecolor'] = scatter.get_facecolor()[0] ax.errorbar(data[x].values, data[y].values, linestyle='none', **err_kwds) - def _convert_size_vals_to_points(self, vals): + def _convert_column_to_size(self, col_name): min_size, max_size = self.size_range - val_range = vals.max() - vals.min() - normalized_vals = (vals - vals.min()) / val_range - point_sizes = (min_size + (normalized_vals * (max_size - min_size))) + size_col = self.data[col_name] + + if com.is_categorical_dtype(size_col): + n_categories = len(size_col.cat.categories) + cat_sizes = np.linspace(min_size, max_size, num=n_categories) + size_mapper = Series(cat_sizes, index=size_col.cat.categories) + point_sizes = size_col.map(size_mapper) + else: + vals = self.data[col_name].values + val_range = vals.max() - vals.min() + normalized_vals = (vals - vals.min()) / val_range + point_sizes = (min_size + (normalized_vals * (max_size - min_size))) + return point_sizes def _post_plot_logic(self): From cc2415a2c80220d793b2d25e411ccf7526470f63 Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Mon, 24 Nov 2014 21:00:41 +1100 Subject: [PATCH 3/8] Add to release notes --- doc/source/whatsnew/v0.15.2.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 6688f106f922e..6ecba9e0e0ad8 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -66,6 +66,7 @@ Enhancements - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here`. - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files. +- Added support for passing a column name as the size argument to ``DataFrame.plot(kind='scatter')``, along with a ``size_range`` argument to control scaling (:issue:`8244`). .. _whatsnew_0152.performance: From af83de33d2678926fc985bc1201ca801e72b5a15 Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Mon, 24 Nov 2014 21:01:49 +1100 Subject: [PATCH 4/8] Add sizing examples to docs --- doc/source/visualization.rst | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index f30d6c9d5d4c0..441bdf7b1f16e 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -8,7 +8,7 @@ import pandas as pd from numpy.random import randn, rand, randint np.random.seed(123456) - from pandas import DataFrame, Series, date_range, options + from pandas import DataFrame, Series, date_range, options, Categorical import pandas.util.testing as tm np.set_printoptions(precision=4, suppress=True) import matplotlib.pyplot as plt @@ -587,19 +587,40 @@ each point: plt.close('all') -You can pass other keywords supported by matplotlib ``scatter``. -Below example shows a bubble chart using a dataframe column values as bubble size. +You can also pass a column name as the ``s`` (size) argument to have +the point sizes scale according to that column's values. The minimum and +maximum sizes of the bubbles (in points) are controlled by the +``size_range`` argument, with a default range of ``(50, 1000)``. The +below example shows a bubble chart using a dataframe column values +as bubble size. .. ipython:: python - @savefig scatter_plot_bubble.png - df.plot(kind='scatter', x='a', y='b', s=df['c']*200); + @savefig scatter_plot_sizes.png + df.plot(kind='scatter', x='a', y='b', s='c'); .. ipython:: python :suppress: plt.close('all') +Categorical columns can also be used to set point sizes, producing +a set of equally spaced point sizes: + +.. ipython:: python + + df['group'] = Categorical(randint(1, 4, 50)) + @savefig scatter_plot_categorical_sizes.png + df.plot(kind='scatter', x='a', y='b', s='group') + +.. ipython:: python + :suppress: + + plt.close('all') + +You can pass other keywords supported by matplotlib ``scatter``, e.g. ``alpha`` +to control the transparency of points. + See the :meth:`scatter ` method and the `matplotlib scatter documenation `__ for more. From 9a385710cfaea4efed9da20c24856d95dd55eddb Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Mon, 24 Nov 2014 21:09:21 +1100 Subject: [PATCH 5/8] Only use s as column name if string --- pandas/tools/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 19cea86fb541d..75d8af651dcb8 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1394,7 +1394,7 @@ def __init__(self, data, x, y, c=None, s=None, # generation starts and non-numeric data thrown away if s is None: self.s_values = self.plt.rcParams['lines.markersize'] - elif s in self.data.columns: + elif isinstance(s, str) and s in self.data.columns: self.s_values = self._convert_column_to_size(s) else: self.s_values = s From 69ccae10d2a82a9963d7a814f4b71703595d6cc1 Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Mon, 24 Nov 2014 21:09:46 +1100 Subject: [PATCH 6/8] Docs: only string column names supported --- doc/source/visualization.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 441bdf7b1f16e..24f4e1125a59a 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -588,7 +588,10 @@ each point: plt.close('all') You can also pass a column name as the ``s`` (size) argument to have -the point sizes scale according to that column's values. The minimum and +the point sizes scale according to that column's values. Currently +this is only supported for string column names. + +The minimum and maximum sizes of the bubbles (in points) are controlled by the ``size_range`` argument, with a default range of ``(50, 1000)``. The below example shows a bubble chart using a dataframe column values From a103bbaca67d1dc98ef8e13f740eee530ac896dd Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Mon, 24 Nov 2014 21:19:33 +1100 Subject: [PATCH 7/8] Add tests --- pandas/tests/test_graphics.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 74ec6d22ca4cd..0cf886aa58b76 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -9,7 +9,8 @@ from datetime import datetime, date -from pandas import Series, DataFrame, MultiIndex, PeriodIndex, date_range +from pandas import (Series, DataFrame, MultiIndex, PeriodIndex, date_range, + Categorical) from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip, iteritems, OrderedDict) from pandas.util.decorators import cache_readonly @@ -1645,6 +1646,32 @@ def test_plot_scatter_with_c(self): self.assertIs(ax.collections[0].colorbar, None) self._check_colors(ax.collections, facecolors=['r']) + @slow + def test_plot_scatter_with_size(self): + df = DataFrame(randn(6, 3), + index=list(string.ascii_letters[:6]), + columns=['x', 'y', 'z']) + df['group'] = Categorical(random.randint(1, 4, 6)) + + size_range = (100, 500) + ax1 = df.plot(kind='scatter', x='x', y='y', s='z', + size_range=size_range) + point_sizes1 = ax1.collections[0]._sizes + self.assertGreaterEqual(min(point_sizes1), size_range[0]) + self.assertLessEqual(max(point_sizes1), size_range[1]) + + # Categorical size column + ax2 = df.plot(kind='scatter', x='x', y='y', s='group', + size_range=size_range) + point_sizes2 = ax2.collections[0]._sizes + self.assertGreaterEqual(min(point_sizes2), size_range[0]) + self.assertLessEqual(max(point_sizes2), size_range[1]) + unique_sizes = np.unique(point_sizes2) + self.assertEqual( + len(unique_sizes), + len(df['group'].cat.categories) + ) + @slow def test_plot_bar(self): df = DataFrame(randn(6, 4), From a404ad88159603f1e1d39fa0d6272d8bacad2014 Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Mon, 24 Nov 2014 21:25:12 +1100 Subject: [PATCH 8/8] Combine paragraphs at end of scatter docs --- doc/source/visualization.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 24f4e1125a59a..42fa3da67ff4b 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -622,9 +622,8 @@ a set of equally spaced point sizes: plt.close('all') You can pass other keywords supported by matplotlib ``scatter``, e.g. ``alpha`` -to control the transparency of points. - -See the :meth:`scatter ` method and the +to control the transparency of points. See the +:meth:`scatter ` method and the `matplotlib scatter documenation `__ for more. .. _visualization.hexbin: