
lazily load dask arrays to dask data frames by calling to_dask_dataframe #1489


Merged: 33 commits, Oct 28, 2017

Commits
- 66452cc: Merge remote-tracking branch 'pydata/master' (jmunroe, Apr 15, 2017)
- f4b564e: Merge branch 'master' of https://github.com/pydata/xarray (jmunroe, May 2, 2017)
- aac672d: Merge remote-tracking branch 'upstream/master' (jmunroe, Jul 26, 2017)
- 67a71b6: Merge remote-tracking branch 'upstream/master' (jmunroe, Aug 1, 2017)
- 55417aa: add test for conversion to dask dataframes (jmunroe, Jul 26, 2017)
- bf92c4c: WIP: beginning implementation (jmunroe, Jul 26, 2017)
- 84fe8e4: use dd.from_dask_array to convert dask arrays to dask dataframes (jmunroe, Jul 26, 2017)
- 157613b: initial attempt at creating dask dataframes from dask dataarrays (jmunroe, Jul 27, 2017)
- 414be29: create separate to_dask_dataframe method (jmunroe, Jul 27, 2017)
- bf9ec78: added docstring (jmunroe, Jul 27, 2017)
- c64db76: minor code clean up (jmunroe, Jul 27, 2017)
- 6703a41: default to set_index=False (jmunroe, Jul 28, 2017)
- 138a237: create dask frame directly to support multiple data types in datafram… (jmunroe, Aug 1, 2017)
- 17d7819: fixed error in calculating divisions for dataframe; fixed dataarray a… (jmunroe, Aug 1, 2017)
- 41a8e0d: refactor to use dask dataframe api to construct dataframes (jmunroe, Sep 5, 2017)
- 47833f4: merge from upstream (jmunroe, Oct 10, 2017)
- 3c6dcb6: fix style issues reported by flake8 (jmunroe, Oct 10, 2017)
- 27de6b3: added note describing new to_dask_dataframe method (jmunroe, Oct 10, 2017)
- 2ef7983: add entry in api docs for Dataset.to_dask_dataframe (jmunroe, Oct 10, 2017)
- 1cf80a7: added reference to to_dask_dataframe in the pandas doc discussing to_… (jmunroe, Oct 10, 2017)
- 024a1aa: add missing method identifier in docs (jmunroe, Oct 10, 2017)
- 67dbbe5: retain coordinate variables in dask dataframe even if index is not set (jmunroe, Oct 11, 2017)
- dd3c9c5: add example of to_dask_dataframe() to docs (jmunroe, Oct 11, 2017)
- 2948c86: fix flake8 issues (jmunroe, Oct 11, 2017)
- d4247a9: clarify behaviour in doc string of to_dask_dataframe (jmunroe, Oct 11, 2017)
- 5fd1fc7: Merge branch 'master' of https://github.com/pydata/xarray into dask_d… (jmunroe, Oct 19, 2017)
- 4705fde: resolve merge (jmunroe, Oct 20, 2017)
- c73f5b4: restructure to handle coordinate and data variables more carefully (jmunroe, Oct 20, 2017)
- 64458e2: add test if dataset has a dimension without coordinates (jmunroe, Oct 20, 2017)
- 6f6b48d: break up tests into sub tests (jmunroe, Oct 20, 2017)
- 9dcdbca: fix flake8 issues (jmunroe, Oct 20, 2017)
- a422965: Use assert_frame_equal (shoyer, Oct 27, 2017)
- ab8180b: Use int64 (shoyer, Oct 27, 2017)
1 change: 1 addition & 0 deletions doc/api.rst
@@ -423,6 +423,7 @@ Dataset methods
save_mfdataset
Dataset.to_array
Dataset.to_dataframe
Dataset.to_dask_dataframe
Dataset.to_dict
Dataset.from_dataframe
Dataset.from_dict
11 changes: 10 additions & 1 deletion doc/dask.rst
@@ -100,6 +100,15 @@ Once you've manipulated a dask array, you can still write a dataset too big to
fit into memory back to disk by using :py:meth:`~xarray.Dataset.to_netcdf` in the
usual way.

A dataset can also be converted to a dask DataFrame using :py:meth:`~xarray.Dataset.to_dask_dataframe`.

.. ipython:: python

df = ds.to_dask_dataframe()
df

Dask DataFrames do not support multi-indexes, so the coordinate variables from the dataset are included as columns in the dask DataFrame.
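
When a dataset has exactly one dimension, ``set_index=True`` can be passed to index the resulting DataFrame by that dimension's coordinate instead. An editor's sketch, not part of this diff (``ds1d`` is a hypothetical one-dimensional dataset; assumes ``np`` and ``xr`` are imported as in the surrounding examples):

.. ipython:: python

    ds1d = xr.Dataset({'a': ('t', np.arange(5))}).chunk(2)
    ds1d['t'] = ('t', np.arange(5) * 2)
    ds1d.to_dask_dataframe(set_index=True)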

Using dask with xarray
----------------------

@@ -145,7 +154,7 @@ Explicit conversion by wrapping a DataArray with ``np.asarray`` also works:
...

Alternatively you can load the data into memory but keep the arrays as
-dask arrays using the `~xarray.Dataset.persist` method:
+dask arrays using the :py:meth:`~xarray.Dataset.persist` method:

.. ipython::

3 changes: 3 additions & 0 deletions doc/pandas.rst
@@ -60,6 +60,9 @@ To convert the ``DataFrame`` to any other convenient representation,
use ``DataFrame`` methods like :py:meth:`~pandas.DataFrame.reset_index`,
:py:meth:`~pandas.DataFrame.stack` and :py:meth:`~pandas.DataFrame.unstack`.

For datasets containing dask arrays where the data should be lazily loaded, see the
:py:meth:`Dataset.to_dask_dataframe() <xarray.Dataset.to_dask_dataframe>` method.

To create a ``Dataset`` from a ``DataFrame``, use the
:py:meth:`~xarray.Dataset.from_dataframe` class method or the equivalent
:py:meth:`pandas.DataFrame.to_xarray <DataFrame.to_xarray>` method (pandas
5 changes: 5 additions & 0 deletions doc/whats-new.rst
@@ -186,6 +186,11 @@ Enhancements
functions on data stored as dask arrays (:issue:`1279`).
By `Joe Hamman <https://github.com/jhamman>`_.

- Added new method :py:meth:`~Dataset.to_dask_dataframe` to
``Dataset``, which converts a dataset into a dask dataframe.
This allows lazy loading of data from a dataset containing dask arrays (:issue:`1462`).
By `James Munroe <https://github.com/jmunroe>`_.

- Support reading and writing unlimited dimensions with h5netcdf (:issue:`1636`).
By `Joe Hamman <https://github.com/jhamman>`_.

60 changes: 60 additions & 0 deletions xarray/core/dataset.py
@@ -2416,6 +2416,66 @@ def from_dataframe(cls, dataframe):
obj[name] = (dims, data)
return obj

def to_dask_dataframe(self, set_index=False):
"""
Convert this dataset into a dask.dataframe.DataFrame.

Both the coordinate and data variables in this dataset form
the columns of the DataFrame.

If set_index=True, the dask DataFrame is indexed by this dataset's
coordinate. Since dask DataFrames do not support multi-indexes,
set_index only works if there is one coordinate dimension.
"""

import dask.dataframe as dd

ordered_dims = self.dims
Member commented:

We should probably add that dims_order keyword argument. Then this becomes something like:

if dims_order is None:
    dims_order = self.dims
ordered_dims = OrderedDict((k, self.dims[k]) for k in dims_order)
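
A self-contained illustration of that reordering idea (editor's sketch; ``dims`` and ``dims_order`` are hypothetical stand-ins following the reviewer's suggestion, not merged code):

    from collections import OrderedDict

    dims = {'x': 2, 'y': 3}      # stand-in for self.dims
    dims_order = ['y', 'x']      # caller-requested dimension order

    # rebuild the mapping in the requested order, for use downstream
    ordered_dims = OrderedDict((k, dims[k]) for k in dims_order)
    assert list(ordered_dims) == ['y', 'x']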

chunks = self.chunks

# order columns so that coordinates appear before data
columns = list(self.coords) + list(self.data_vars)

data = []
for k in columns:
v = self._variables[k]

# consider coordinate variables as well as data variables
Member commented:

This is a good place to mention in a comment your discovery that we need to convert to base variables in order for chunk() to work properly.

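# per review: convert to a base Variable so that chunk() below works properly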
if isinstance(v, xr.IndexVariable):
v = v.to_base_variable()

# ensure all variables span the same dimensions
v = v.set_dims(ordered_dims)

# ensure all variables have the same chunking structure
if v.chunks != chunks:
v = v.chunk(chunks)

# reshape variable contents as a 1d array
Member commented:

Nit: some of these comments are probably slightly overboard -- if they simply restate what's in the code it's better to omit them.

d = v.data.reshape(-1)

# convert to dask DataFrames
s = dd.from_array(d, columns=[k])

data.append(s)

df = dd.concat(data, axis=1)

if set_index:

if len(ordered_dims) != 1:
raise ValueError(
'set_index=True is only valid '
'for one-dimensional datasets')
Member commented:

Can you include the list of multiple dimensions in the error message?
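
One possible wording along those lines (an editor's sketch, not the text that was merged):

    raise ValueError(
        'set_index=True is only valid for one-dimensional datasets; '
        'got dimensions %r' % list(ordered_dims))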


# extract out first (and only) coordinate variable
coord_dim = list(ordered_dims)[0]

if coord_dim in df.columns:
df = df.set_index(coord_dim)

return df

def to_dict(self):
"""
Convert this dataset to a dictionary following xarray naming
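For context, a small end-to-end sketch of the new method as implemented in this diff (editor's addition; the dataset and variable names are illustrative, and dask must be installed):

    import numpy as np
    import xarray as xr
    import dask.array as da

    ds = xr.Dataset({'a': ('t', da.arange(6, chunks=3))},
                    coords={'t': np.arange(6)})
    df = ds.to_dask_dataframe()  # lazy; columns are 't' then 'a'
    print(df.compute())          # materializes a pandas DataFrame
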
6 changes: 6 additions & 0 deletions xarray/tests/__init__.py
@@ -16,6 +16,12 @@
from xarray.core.pycompat import PY3
from xarray.testing import assert_equal, assert_identical, assert_allclose

try:
from pandas.testing import assert_frame_equal
except ImportError:
# old location, for pandas < 0.20
from pandas.util.testing import assert_frame_equal

try:
import unittest2 as unittest
except ImportError:
103 changes: 99 additions & 4 deletions xarray/tests/test_dask.py
@@ -13,13 +13,14 @@
import xarray as xr
from xarray import Variable, DataArray, Dataset
import xarray.ufuncs as xu
-from xarray.core.pycompat import suppress
-from . import TestCase
+from xarray.core.pycompat import suppress, OrderedDict
+from . import TestCase, assert_frame_equal

from xarray.tests import mock

dask = pytest.importorskip('dask')
import dask.array as da
import dask.dataframe as dd


class DaskTestCase(TestCase):
@@ -29,9 +30,9 @@ def assertLazyAnd(self, expected, actual, test):
if isinstance(actual, Dataset):
for k, v in actual.variables.items():
if k in actual.dims:
-self.assertIsInstance(var.data, np.ndarray)
+self.assertIsInstance(v.data, np.ndarray)
else:
-self.assertIsInstance(var.data, da.Array)
+self.assertIsInstance(v.data, da.Array)
elif isinstance(actual, DataArray):
self.assertIsInstance(actual.data, da.Array)
for k, v in actual.coords.items():
@@ -546,6 +547,100 @@ def test_from_dask_variable(self):
coords={'x': range(4)}, name='foo')
self.assertLazyAndIdentical(self.lazy_array, a)

def test_to_dask_dataframe(self):
Member commented:

It would be appreciated if you could break this up into a few more sub-methods. We don't always follow this well currently, but smaller tests that each check only one thing are easier to work with.

There's no strict line limit, but aim for less than 10-20 lines if possible. Another good time to break a test into parts is when you have different input data.

Contributor (author) replied:
No problem. Done.

# Test conversion of Datasets to dask DataFrames
x = da.from_array(np.random.randn(10), chunks=4)
y = np.arange(10, dtype='uint8')
t = list('abcdefghij')

ds = Dataset(OrderedDict([('a', ('t', x)),
('b', ('t', y)),
('t', ('t', t))]))

expected_pd = pd.DataFrame({'a': x,
'b': y},
index=pd.Index(t, name='t'))

# test if 1-D index is correctly set up
expected = dd.from_pandas(expected_pd, chunksize=4)
actual = ds.to_dask_dataframe(set_index=True)
# test if we have dask dataframes
self.assertIsInstance(actual, dd.DataFrame)

# use pandas' assert_frame_equal to check that the dataframes are equivalent
assert_frame_equal(expected.compute(), actual.compute())

# test if no index is given
expected = dd.from_pandas(expected_pd.reset_index(drop=False),
chunksize=4)

actual = ds.to_dask_dataframe(set_index=False)

self.assertIsInstance(actual, dd.DataFrame)
assert_frame_equal(expected.compute(), actual.compute())

def test_to_dask_dataframe_2D(self):
# Test if 2-D dataset is supplied
w = da.from_array(np.random.randn(2, 3), chunks=(1, 2))
ds = Dataset({'w': (('x', 'y'), w)})
ds['x'] = ('x', np.array([0, 1], np.int64))
ds['y'] = ('y', list('abc'))

# dask dataframes do not (yet) support multiindex,
# but when it does, this would be the expected index:
exp_index = pd.MultiIndex.from_arrays(
[[0, 0, 0, 1, 1, 1], ['a', 'b', 'c', 'a', 'b', 'c']],
names=['x', 'y'])
expected = pd.DataFrame({'w': w.reshape(-1)},
index=exp_index)
# so for now, reset the index
expected = expected.reset_index(drop=False)

actual = ds.to_dask_dataframe(set_index=False)

self.assertIsInstance(actual, dd.DataFrame)
assert_frame_equal(expected, actual.compute())

def test_to_dask_dataframe_coordinates(self):
# Test if coordinate is also a dask array
x = da.from_array(np.random.randn(10), chunks=4)
t = da.from_array(np.arange(10)*2, chunks=4)

ds = Dataset(OrderedDict([('a', ('t', x)),
('t', ('t', t))]))

expected_pd = pd.DataFrame({'a': x},
index=pd.Index(t, name='t'))
expected = dd.from_pandas(expected_pd, chunksize=4)
actual = ds.to_dask_dataframe(set_index=True)
self.assertIsInstance(actual, dd.DataFrame)
assert_frame_equal(expected.compute(), actual.compute())

def test_to_dask_dataframe_not_daskarray(self):
# Test if DataArray is not a dask array
x = np.random.randn(10)
y = np.arange(10, dtype='uint8')
t = list('abcdefghij')

ds = Dataset(OrderedDict([('a', ('t', x)),
('b', ('t', y)),
('t', ('t', t))]))

expected = pd.DataFrame({'a': x, 'b': y},
index=pd.Index(t, name='t'))

actual = ds.to_dask_dataframe(set_index=True)
self.assertIsInstance(actual, dd.DataFrame)
assert_frame_equal(expected, actual.compute())

def test_to_dask_dataframe_no_coordinate(self):
# Test if Dataset has a dimension without coordinates
x = da.from_array(np.random.randn(10), chunks=4)
ds = Dataset({'x': ('dim_0', x)})
expected = pd.DataFrame({'x': x.compute()})
actual = ds.to_dask_dataframe(set_index=True)
assert_frame_equal(expected, actual.compute())
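
As a usage note (editor's addition), the new tests can be run on their own with pytest's keyword filter:

    pytest xarray/tests/test_dask.py -k to_dask_dataframe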


@pytest.mark.parametrize("method", ['load', 'compute'])
def test_dask_kwargs_variable(method):