@@ -2424,7 +2424,7 @@ def apply(self, func, keep_attrs=False, args=(), **kwargs):
2424
2424
-------
2425
2425
applied : Dataset
2426
2426
Resulting dataset from applying ``func`` over each data variable.
2427
-
2427
+
2428
2428
Examples
2429
2429
--------
2430
2430
>>> da = xr.DataArray(np.random.randn(2, 3))
@@ -2442,7 +2442,7 @@ def apply(self, func, keep_attrs=False, args=(), **kwargs):
2442
2442
Dimensions without coordinates: dim_0, dim_1, x
2443
2443
Data variables:
2444
2444
foo (dim_0, dim_1) float64 0.3751 1.951 1.945 0.2948 0.711 0.3948
2445
- bar (x) float64 1.0 2.0
2445
+ bar (x) float64 1.0 2.0
2446
2446
"""
2447
2447
variables = OrderedDict (
2448
2448
(k , maybe_wrap_array (v , func (v , * args , ** kwargs )))
@@ -2577,63 +2577,79 @@ def from_dataframe(cls, dataframe):
2577
2577
obj [name ] = (dims , data )
2578
2578
return obj
2579
2579
2580
- def to_dask_dataframe (self , set_index = False ):
2580
+ def to_dask_dataframe (self , dim_order = None , set_index = False ):
2581
2581
"""
2582
2582
Convert this dataset into a dask.dataframe.DataFrame.
2583
2583
2584
- Both the coordinate and data variables in this dataset form
2584
+ The dimensions, coordinates and data variables in this dataset form
2585
2585
the columns of the DataFrame.
2586
2586
2587
- If set_index=True, the dask DataFrame is indexed by this dataset's
2588
- coordinate. Since dask DataFrames to not support multi-indexes,
2589
- set_index only works if there is one coordinate dimension.
2587
+ Parameters
2588
+ ----------
2589
+ dim_order : list, optional
2590
+ Hierarchical dimension order for the resulting dataframe. All
2591
+ arrays are transposed to this order and then written out as flat
2592
+ vectors in contiguous order, so the last dimension in this list
2593
+ will be contiguous in the resulting DataFrame. This has a major
2594
+ influence on which operations are efficient on the resulting dask
2595
+ dataframe.
2596
+
2597
+ If provided, must include all dimensions on this dataset. By
2598
+ default, dimensions are sorted alphabetically.
2599
+ set_index : bool, optional
2600
+ If set_index=True, the dask DataFrame is indexed by this dataset's
2601
+ coordinate. Since dask DataFrames do not support multi-indexes,
2602
+ set_index only works if the dataset only contains one dimension.
2603
+
2604
+ Returns
2605
+ -------
2606
+ dask.dataframe.DataFrame
2590
2607
"""
2591
2608
2609
+ import dask .array as da
2592
2610
import dask .dataframe as dd
2593
2611
2594
- ordered_dims = self .dims
2595
- chunks = self .chunks
2596
-
2597
- # order columns so that coordinates appear before data
2598
- columns = list (self .coords ) + list (self .data_vars )
2599
-
2600
- data = []
2601
- for k in columns :
2602
- v = self ._variables [k ]
2603
-
2604
- # consider coordinate variables as well as data varibles
2605
- if isinstance (v , xr .IndexVariable ):
2606
- v = v .to_base_variable ()
2612
+ if dim_order is None :
2613
+ dim_order = list (self .dims )
2614
+ elif set (dim_order ) != set (self .dims ):
2615
+ raise ValueError (
2616
+ 'dim_order {} does not match the set of dimensions on this '
2617
+ 'Dataset: {}' .format (dim_order , list (self .dims )))
2607
2618
2608
- # ensure all variables span the same dimensions
2609
- v = v .set_dims (ordered_dims )
2619
+ ordered_dims = OrderedDict ((k , self .dims [k ]) for k in dim_order )
2610
2620
2611
- # ensure all variables have the same chunking structure
2612
- if v . chunks != chunks :
2613
- v = v . chunk ( chunks )
2621
+ columns = list ( ordered_dims )
2622
+ columns . extend ( k for k in self . coords if k not in self . dims )
2623
+ columns . extend ( self . data_vars )
2614
2624
2615
- # reshape variable contents as a 1d array
2616
- d = v .data .reshape (- 1 )
2625
+ series_list = []
2626
+ for name in columns :
2627
+ try :
2628
+ var = self .variables [name ]
2629
+ except KeyError :
2630
+ # dimension without a matching coordinate
2631
+ size = self .dims [name ]
2632
+ data = da .arange (size , chunks = size , dtype = np .int64 )
2633
+ var = Variable ((name ,), data )
2617
2634
2618
- # convert to dask DataFrames
2619
- s = dd .from_array (d , columns = [k ])
2635
+ # IndexVariable objects have a dummy .chunk() method
2636
+ if isinstance (var , IndexVariable ):
2637
+ var = var .to_base_variable ()
2620
2638
2621
- data .append (s )
2639
+ dask_array = var .set_dims (ordered_dims ).chunk (self .chunks ).data
2640
+ series = dd .from_array (dask_array .reshape (- 1 ), columns = [name ])
2641
+ series_list .append (series )
2622
2642
2623
- df = dd .concat (data , axis = 1 )
2643
+ df = dd .concat (series_list , axis = 1 )
2624
2644
2625
2645
if set_index :
2626
-
2627
- if len (ordered_dims ) != 1 :
2628
- raise ValueError (
2629
- 'set_index=True only is valid for '
2630
- 'for one-dimensional datasets' )
2631
-
2632
- # extract out first (and only) coordinate variable
2633
- coord_dim = list (ordered_dims )[0 ]
2634
-
2635
- if coord_dim in df .columns :
2636
- df = df .set_index (coord_dim )
2646
+ if len (dim_order ) == 1 :
2647
+ (dim ,) = dim_order
2648
+ df = df .set_index (dim )
2649
+ else :
2650
+ # triggers an error about multi-indexes, even if only one
2651
+ # dimension is passed
2652
+ df = df .set_index (dim_order )
2637
2653
2638
2654
return df
2639
2655
0 commit comments