Skip to content

Commit 7e9193c

Browse files
shoyerJoe Hamman
authored and
Joe Hamman
committed
Tweak to to_dask_dataframe() (#1667)
* Tweak to to_dask_dataframe()
  - Add a `dim_order` argument
  - Always write columns for each dimension
  - Docstring to NumPy format
* Fix windows test failure
* More windows failure
* Fix failing test
* Use da.arange() inside to_dask_dataframe
1 parent 20f9ffd commit 7e9193c

File tree

2 files changed

+99
-45
lines changed

2 files changed

+99
-45
lines changed

xarray/core/dataset.py

Lines changed: 58 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2424,7 +2424,7 @@ def apply(self, func, keep_attrs=False, args=(), **kwargs):
24242424
-------
24252425
applied : Dataset
24262426
Resulting dataset from applying ``func`` over each data variable.
2427-
2427+
24282428
Examples
24292429
--------
24302430
>>> da = xr.DataArray(np.random.randn(2, 3))
@@ -2442,7 +2442,7 @@ def apply(self, func, keep_attrs=False, args=(), **kwargs):
24422442
Dimensions without coordinates: dim_0, dim_1, x
24432443
Data variables:
24442444
foo (dim_0, dim_1) float64 0.3751 1.951 1.945 0.2948 0.711 0.3948
2445-
bar (x) float64 1.0 2.0
2445+
bar (x) float64 1.0 2.0
24462446
"""
24472447
variables = OrderedDict(
24482448
(k, maybe_wrap_array(v, func(v, *args, **kwargs)))
@@ -2577,63 +2577,79 @@ def from_dataframe(cls, dataframe):
25772577
obj[name] = (dims, data)
25782578
return obj
25792579

2580-
def to_dask_dataframe(self, set_index=False):
2580+
def to_dask_dataframe(self, dim_order=None, set_index=False):
25812581
"""
25822582
Convert this dataset into a dask.dataframe.DataFrame.
25832583
2584-
Both the coordinate and data variables in this dataset form
2584+
The dimensions, coordinates and data variables in this dataset form
25852585
the columns of the DataFrame.
25862586
2587-
If set_index=True, the dask DataFrame is indexed by this dataset's
2588-
coordinate. Since dask DataFrames do not support multi-indexes,
2589-
set_index only works if there is one coordinate dimension.
2587+
Parameters
2588+
----------
2589+
dim_order : list, optional
2590+
Hierarchical dimension order for the resulting dataframe. All
2591+
arrays are transposed to this order and then written out as flat
2592+
vectors in contiguous order, so the last dimension in this list
2593+
will be contiguous in the resulting DataFrame. This has a major
2594+
influence on which operations are efficient on the resulting dask
2595+
dataframe.
2596+
2597+
If provided, must include all dimensions on this dataset. By
2598+
default, dimensions are sorted alphabetically.
2599+
set_index : bool, optional
2600+
If set_index=True, the dask DataFrame is indexed by this dataset's
2601+
coordinate. Since dask DataFrames do not support multi-indexes,
2602+
set_index only works if the dataset only contains one dimension.
2603+
2604+
Returns
2605+
-------
2606+
dask.dataframe.DataFrame
25902607
"""
25912608

2609+
import dask.array as da
25922610
import dask.dataframe as dd
25932611

2594-
ordered_dims = self.dims
2595-
chunks = self.chunks
2596-
2597-
# order columns so that coordinates appear before data
2598-
columns = list(self.coords) + list(self.data_vars)
2599-
2600-
data = []
2601-
for k in columns:
2602-
v = self._variables[k]
2603-
2604-
# consider coordinate variables as well as data variables
2605-
if isinstance(v, xr.IndexVariable):
2606-
v = v.to_base_variable()
2612+
if dim_order is None:
2613+
dim_order = list(self.dims)
2614+
elif set(dim_order) != set(self.dims):
2615+
raise ValueError(
2616+
'dim_order {} does not match the set of dimensions on this '
2617+
'Dataset: {}'.format(dim_order, list(self.dims)))
26072618

2608-
# ensure all variables span the same dimensions
2609-
v = v.set_dims(ordered_dims)
2619+
ordered_dims = OrderedDict((k, self.dims[k]) for k in dim_order)
26102620

2611-
# ensure all variables have the same chunking structure
2612-
if v.chunks != chunks:
2613-
v = v.chunk(chunks)
2621+
columns = list(ordered_dims)
2622+
columns.extend(k for k in self.coords if k not in self.dims)
2623+
columns.extend(self.data_vars)
26142624

2615-
# reshape variable contents as a 1d array
2616-
d = v.data.reshape(-1)
2625+
series_list = []
2626+
for name in columns:
2627+
try:
2628+
var = self.variables[name]
2629+
except KeyError:
2630+
# dimension without a matching coordinate
2631+
size = self.dims[name]
2632+
data = da.arange(size, chunks=size, dtype=np.int64)
2633+
var = Variable((name,), data)
26172634

2618-
# convert to dask DataFrames
2619-
s = dd.from_array(d, columns=[k])
2635+
# IndexVariable objects have a dummy .chunk() method
2636+
if isinstance(var, IndexVariable):
2637+
var = var.to_base_variable()
26202638

2621-
data.append(s)
2639+
dask_array = var.set_dims(ordered_dims).chunk(self.chunks).data
2640+
series = dd.from_array(dask_array.reshape(-1), columns=[name])
2641+
series_list.append(series)
26222642

2623-
df = dd.concat(data, axis=1)
2643+
df = dd.concat(series_list, axis=1)
26242644

26252645
if set_index:
2626-
2627-
if len(ordered_dims) != 1:
2628-
raise ValueError(
2629-
'set_index=True only is valid for '
2630-
'for one-dimensional datasets')
2631-
2632-
# extract out first (and only) coordinate variable
2633-
coord_dim = list(ordered_dims)[0]
2634-
2635-
if coord_dim in df.columns:
2636-
df = df.set_index(coord_dim)
2646+
if len(dim_order) == 1:
2647+
(dim,) = dim_order
2648+
df = df.set_index(dim)
2649+
else:
2650+
# triggers an error about multi-indexes, even if only one
2651+
# dimension is passed
2652+
df = df.set_index(dim_order)
26372653

26382654
return df
26392655

xarray/tests/test_dask.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,9 @@ def test_from_dask_variable(self):
547547
coords={'x': range(4)}, name='foo')
548548
self.assertLazyAndIdentical(self.lazy_array, a)
549549

550+
551+
class TestToDaskDataFrame(TestCase):
552+
550553
def test_to_dask_dataframe(self):
551554
# Test conversion of Datasets to dask DataFrames
552555
x = da.from_array(np.random.randn(10), chunks=4)
@@ -595,12 +598,24 @@ def test_to_dask_dataframe_2D(self):
595598
index=exp_index)
596599
# so for now, reset the index
597600
expected = expected.reset_index(drop=False)
598-
599601
actual = ds.to_dask_dataframe(set_index=False)
600602

601603
self.assertIsInstance(actual, dd.DataFrame)
602604
assert_frame_equal(expected, actual.compute())
603605

606+
@pytest.mark.xfail(raises=NotImplementedError)
607+
def test_to_dask_dataframe_2D_set_index(self):
608+
# This will fail until dask implements MultiIndex support
609+
w = da.from_array(np.random.randn(2, 3), chunks=(1, 2))
610+
ds = Dataset({'w': (('x', 'y'), w)})
611+
ds['x'] = ('x', np.array([0, 1], np.int64))
612+
ds['y'] = ('y', list('abc'))
613+
614+
expected = ds.compute().to_dataframe()
615+
actual = ds.to_dask_dataframe(set_index=True)
616+
self.assertIsInstance(actual, dd.DataFrame)
617+
assert_frame_equal(expected, actual.compute())
618+
604619
def test_to_dask_dataframe_coordinates(self):
605620
# Test if coordinate is also a dask array
606621
x = da.from_array(np.random.randn(10), chunks=4)
@@ -634,13 +649,36 @@ def test_to_dask_dataframe_not_daskarray(self):
634649
assert_frame_equal(expected, actual.compute())
635650

636651
def test_to_dask_dataframe_no_coordinate(self):
637-
# Test if Dataset has a dimension without coordinates
638652
x = da.from_array(np.random.randn(10), chunks=4)
639653
ds = Dataset({'x': ('dim_0', x)})
640-
expected = pd.DataFrame({'x': x.compute()})
654+
655+
expected = ds.compute().to_dataframe().reset_index()
656+
actual = ds.to_dask_dataframe()
657+
self.assertIsInstance(actual, dd.DataFrame)
658+
assert_frame_equal(expected, actual.compute())
659+
660+
expected = ds.compute().to_dataframe()
641661
actual = ds.to_dask_dataframe(set_index=True)
662+
self.assertIsInstance(actual, dd.DataFrame)
663+
assert_frame_equal(expected, actual.compute())
664+
665+
def test_to_dask_dataframe_dim_order(self):
666+
values = np.array([[1, 2], [3, 4]], dtype=np.int64)
667+
ds = Dataset({'w': (('x', 'y'), values)}).chunk(1)
668+
669+
expected = ds['w'].to_series().reset_index()
670+
actual = ds.to_dask_dataframe(dim_order=['x', 'y'])
671+
self.assertIsInstance(actual, dd.DataFrame)
672+
assert_frame_equal(expected, actual.compute())
673+
674+
expected = ds['w'].T.to_series().reset_index()
675+
actual = ds.to_dask_dataframe(dim_order=['y', 'x'])
676+
self.assertIsInstance(actual, dd.DataFrame)
642677
assert_frame_equal(expected, actual.compute())
643678

679+
with raises_regex(ValueError, 'does not match the set of dimensions'):
680+
ds.to_dask_dataframe(dim_order=['x'])
681+
644682

645683
@pytest.mark.parametrize("method", ['load', 'compute'])
646684
def test_dask_kwargs_variable(method):

0 commit comments

Comments
 (0)