
Commit 3d1d134

combine_by_coordinates to handle unnamed data arrays. (#4696)
* Added test for combine_by_coords changes.
* Modified test case to expect a dataset instead of a DataArray. Added converter to combine_by_coords to check for all DataArray case and convert to datasets.
* Added tests to check combine_by_coords for exception with mixed DataArrays and dataset input and with empty list.
* Formatting changes after running black
* Added underscore to helper function to label as private.
* Black formatting changes for whats-new doc file.
* Removed imports in docstring that were automatically added by code styling tools to match the other docstrings.
* Removed duplicate new item line in whats-new.
* combine methods now accept unnamed DataArrays as input.
* combine nested test checks nested lists of unnamed DataArrays.
* Made combine_by_coords more readable.
* Cosmetic changes to code style.
* Removed extra test from merge with previous PR.
* Updated test to use pytest.raises instead of raises_regex.
* Added breaking-change entry to whats new page.
* Added deprecation warning to combine_coords
* Removed index monotonicity checking temporarily.
* Removed duplicate entries from whats new page.
* Removed TODO message
* Added test for combine_nested.
* Added check to combine methods to clarify parameter requirements.
* Reassigned description of changes to bug fixes category.
* Minor style changes.
* Added blank line for style purposes.
1 parent 2f8623d

File tree: 5 files changed, +230 −43 lines

* doc/whats-new.rst
* xarray/core/combine.py
* xarray/core/utils.py
* xarray/tests/test_combine.py
* xarray/tests/test_utils.py
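
With this change, a list of unnamed DataArray objects can be passed straight to combine_by_coords. A minimal sketch of the intended usage, based on the tests added in this commit:

    import xarray as xr

    # Two unnamed 1-D arrays covering adjacent slices of the "x" coordinate.
    a = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
    b = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

    # combine_by_coords now accepts the unnamed arrays and returns a single
    # unnamed DataArray spanning x = 0..3.
    combined = xr.combine_by_coords([a, b])
    print(combined)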

doc/whats-new.rst

Lines changed: 14 additions & 2 deletions
@@ -130,7 +130,6 @@ Thomas Nicholas, Tom Nicholas, Zachary Moon.
 
 New Features
 ~~~~~~~~~~~~
-
 - Implement :py:meth:`DataArray.drop_duplicates`
   to remove duplicate dimension values (:pull:`5239`).
   By `Andrew Huang <https://github.com/ahuang11>`_.
@@ -143,9 +142,22 @@ New Features
 - Raise more informative error when decoding time variables with invalid reference dates.
   (:issue:`5199`, :pull:`5288`). By `Giacomo Caria <https://github.com/gcaria>`_.
 
+Breaking changes
+~~~~~~~~~~~~~~~~
+- The main parameter to :py:func:`combine_by_coords` is renamed to `data_objects` instead
+  of `datasets` so anyone calling this method using a named parameter will need to update
+  the name accordingly (:issue:`3248`, :pull:`4696`).
+  By `Augustus Ijams <https://github.com/aijams>`_.
+
+Deprecations
+~~~~~~~~~~~~
+
+
 Bug fixes
 ~~~~~~~~~
-
+- :py:func:`combine_by_coords` can now handle combining a list of unnamed
+  ``DataArray`` as input (:issue:`3248`, :pull:`4696`).
+  By `Augustus Ijams <https://github.com/aijams>`_.
 - Opening netCDF files from a path that doesn't end in ``.nc`` without supplying
   an explicit ``engine`` works again (:issue:`5295`), fixing a bug introduced in
   0.18.0.
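
The old keyword keeps working for one release cycle: passing datasets= still runs but emits a rename warning. A sketch of the transition path, assuming caller code that used the old keyword name:

    import warnings

    import xarray as xr

    ds1 = xr.Dataset({"a": ("x", [1, 2])}, coords={"x": [0, 1]})
    ds2 = xr.Dataset({"a": ("x", [3, 4])}, coords={"x": [2, 3]})

    # Old spelling: still accepted at this commit, but warns that the
    # argument has been renamed to `data_objects`.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        combined = xr.combine_by_coords(datasets=[ds1, ds2])
        print(caught[0].message)

    # New spelling going forward.
    combined = xr.combine_by_coords(data_objects=[ds1, ds2])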

xarray/core/combine.py

Lines changed: 124 additions & 40 deletions
@@ -1,4 +1,5 @@
 import itertools
+import warnings
 from collections import Counter
 
 import pandas as pd
@@ -8,6 +9,7 @@
 from .dataarray import DataArray
 from .dataset import Dataset
 from .merge import merge
+from .utils import iterate_nested
 
 
 def _infer_concat_order_from_positions(datasets):
@@ -544,6 +546,15 @@ def combine_nested(
     concat
     merge
     """
+    mixed_datasets_and_arrays = any(
+        isinstance(obj, Dataset) for obj in iterate_nested(datasets)
+    ) and any(
+        isinstance(obj, DataArray) and obj.name is None
+        for obj in iterate_nested(datasets)
+    )
+    if mixed_datasets_and_arrays:
+        raise ValueError("Can't combine datasets with unnamed arrays.")
+
     if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
         concat_dim = [concat_dim]
 
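The new guard walks arbitrarily nested input lists via iterate_nested before any combining starts. For illustration, mirroring test_nested_combine_mixed_datasets_arrays below, mixing a Dataset with an unnamed DataArray now fails fast:

    import xarray as xr

    objs = [
        xr.DataArray([0, 1], dims="x", coords={"x": [0, 1]}),
        xr.Dataset({"x": [2, 3]}),
    ]
    try:
        xr.combine_nested(objs, concat_dim="x")
    except ValueError as err:
        print(err)  # Can't combine datasets with unnamed arrays.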
@@ -565,18 +576,79 @@ def vars_as_keys(ds):
     return tuple(sorted(ds))
 
 
-def combine_by_coords(
+def _combine_single_variable_hypercube(
     datasets,
+    fill_value=dtypes.NA,
+    data_vars="all",
+    coords="different",
+    compat="no_conflicts",
+    join="outer",
+    combine_attrs="no_conflicts",
+):
+    """
+    Attempt to combine a list of Datasets into a hypercube using their
+    coordinates.
+
+    All provided Datasets must belong to a single variable, ie. must be
+    assigned the same variable name. This precondition is not checked by this
+    function, so the caller is assumed to know what it's doing.
+
+    This function is NOT part of the public API.
+    """
+    if len(datasets) == 0:
+        raise ValueError(
+            "At least one Dataset is required to resolve variable names "
+            "for combined hypercube."
+        )
+
+    combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
+
+    if fill_value is None:
+        # check that datasets form complete hypercube
+        _check_shape_tile_ids(combined_ids)
+    else:
+        # check only that all datasets have same dimension depth for these
+        # vars
+        _check_dimension_depth_tile_ids(combined_ids)
+
+    # Concatenate along all of concat_dims one by one to create single ds
+    concatenated = _combine_nd(
+        combined_ids,
+        concat_dims=concat_dims,
+        data_vars=data_vars,
+        coords=coords,
+        compat=compat,
+        fill_value=fill_value,
+        join=join,
+        combine_attrs=combine_attrs,
+    )
+
+    # Check the overall coordinates are monotonically increasing
+    for dim in concat_dims:
+        indexes = concatenated.indexes.get(dim)
+        if not (indexes.is_monotonic_increasing or indexes.is_monotonic_decreasing):
+            raise ValueError(
+                "Resulting object does not have monotonic"
+                " global indexes along dimension {}".format(dim)
+            )
+
+    return concatenated
+
+
+# TODO remove empty list default param after version 0.19, see PR4696
+def combine_by_coords(
+    data_objects=[],
     compat="no_conflicts",
     data_vars="all",
     coords="different",
     fill_value=dtypes.NA,
     join="outer",
     combine_attrs="no_conflicts",
+    datasets=None,
 ):
     """
-    Attempt to auto-magically combine the given datasets into one by using
-    dimension coordinates.
+    Attempt to auto-magically combine the given datasets (or data arrays)
+    into one by using dimension coordinates.
 
     This method attempts to combine a group of datasets along any number of
     dimensions into a single entity by inspecting coords and metadata and using
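
The extracted helper keeps the original safety net: after concatenation it verifies that every concatenation dimension ends up with a monotonic global index. A sketch of the kind of input this rejects, reached here via the public entry point (assuming interleaved coordinate ranges survive order inference and reach the check):

    import xarray as xr

    # The x ranges interleave (0, 3 vs. 2, 5), so no ordering of the pieces
    # yields a monotonic global index along "x".
    a = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 3]}, dims="x")
    b = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 5]}, dims="x")
    try:
        xr.combine_by_coords([a, b])
    except ValueError as err:
        print(err)  # Resulting object does not have monotonic global indexes...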
@@ -600,8 +672,9 @@ def combine_by_coords(
 
     Parameters
     ----------
-    datasets : sequence of xarray.Dataset
-        Dataset objects to combine.
+    data_objects : sequence of xarray.Dataset or sequence of xarray.DataArray
+        Data objects to combine.
+
     compat : {"identical", "equals", "broadcast_equals", "no_conflicts", "override"}, optional
         String indicating how to compare variables of the same name for
         potential conflicts:
@@ -776,51 +849,62 @@ def combine_by_coords(
        precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176
     """
 
-    # Group by data vars
-    sorted_datasets = sorted(datasets, key=vars_as_keys)
-    grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
-
-    # Perform the multidimensional combine on each group of data variables
-    # before merging back together
-    concatenated_grouped_by_data_vars = []
-    for vars, datasets_with_same_vars in grouped_by_vars:
-        combined_ids, concat_dims = _infer_concat_order_from_coords(
-            list(datasets_with_same_vars)
+    # TODO remove after version 0.19, see PR4696
+    if datasets is not None:
+        warnings.warn(
+            "The datasets argument has been renamed to `data_objects`."
+            " In future passing a value for datasets will raise an error."
         )
+        data_objects = datasets
 
-        if fill_value is None:
-            # check that datasets form complete hypercube
-            _check_shape_tile_ids(combined_ids)
-        else:
-            # check only that all datasets have same dimension depth for these
-            # vars
-            _check_dimension_depth_tile_ids(combined_ids)
+    if not data_objects:
+        return Dataset()
 
-        # Concatenate along all of concat_dims one by one to create single ds
-        concatenated = _combine_nd(
-            combined_ids,
-            concat_dims=concat_dims,
+    mixed_arrays_and_datasets = any(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    ) and any(isinstance(data_object, Dataset) for data_object in data_objects)
+    if mixed_arrays_and_datasets:
+        raise ValueError("Can't automatically combine datasets with unnamed arrays.")
+
+    all_unnamed_data_arrays = all(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    )
+    if all_unnamed_data_arrays:
+        unnamed_arrays = data_objects
+        temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]
+
+        combined_temp_dataset = _combine_single_variable_hypercube(
+            temp_datasets,
+            fill_value=fill_value,
             data_vars=data_vars,
             coords=coords,
             compat=compat,
-            fill_value=fill_value,
             join=join,
             combine_attrs=combine_attrs,
         )
+        return DataArray()._from_temp_dataset(combined_temp_dataset)
 
-        # Check the overall coordinates are monotonically increasing
-        # TODO (benbovy - flexible indexes): only with pandas.Index?
-        for dim in concat_dims:
-            indexes = concatenated.xindexes.get(dim)
-            if not (
-                indexes.array.is_monotonic_increasing
-                or indexes.array.is_monotonic_decreasing
-            ):
-                raise ValueError(
-                    "Resulting object does not have monotonic"
-                    " global indexes along dimension {}".format(dim)
-                )
-        concatenated_grouped_by_data_vars.append(concatenated)
+    else:
+        # Group by data vars
+        sorted_datasets = sorted(data_objects, key=vars_as_keys)
+        grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
+
+        # Perform the multidimensional combine on each group of data variables
+        # before merging back together
+        concatenated_grouped_by_data_vars = []
+        for vars, datasets_with_same_vars in grouped_by_vars:
+            concatenated = _combine_single_variable_hypercube(
+                list(datasets_with_same_vars),
+                fill_value=fill_value,
+                data_vars=data_vars,
+                coords=coords,
+                compat=compat,
+                join=join,
+                combine_attrs=combine_attrs,
+            )
+            concatenated_grouped_by_data_vars.append(concatenated)
 
     return merge(
         concatenated_grouped_by_data_vars,
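
Taken together, combine_by_coords now dispatches across three paths: empty input short-circuits to an empty Dataset, all-unnamed-DataArray input round-trips through temporary datasets back to an unnamed DataArray, and Dataset input follows the original group-by-variables route. A sketch of all three, based on the tests in this commit:

    import xarray as xr

    # 1. Empty input returns an empty Dataset.
    assert xr.combine_by_coords([]).equals(xr.Dataset())

    # 2. All unnamed DataArrays: combined via temporary datasets and
    #    handed back as a single unnamed DataArray.
    a = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
    b = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
    arr = xr.combine_by_coords([a, b])
    assert isinstance(arr, xr.DataArray) and arr.name is None

    # 3. Datasets take the original grouped-merge path.
    ds1 = xr.Dataset({"v": ("x", [1, 2])}, coords={"x": [0, 1]})
    ds2 = xr.Dataset({"v": ("x", [3, 4])}, coords={"x": [2, 3]})
    assert isinstance(xr.combine_by_coords([ds1, ds2]), xr.Dataset)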

xarray/core/utils.py

Lines changed: 8 additions & 0 deletions
@@ -907,3 +907,11 @@ class Default(Enum):
 
 
 _default = Default.token
+
+
+def iterate_nested(nested_list):
+    for item in nested_list:
+        if isinstance(item, list):
+            yield from iterate_nested(item)
+        else:
+            yield item
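
iterate_nested is a small depth-first generator that flattens arbitrarily nested lists; combine_nested uses it to scan every leaf of its input for the mixed-types check. For example:

    from xarray.core.utils import iterate_nested

    # Yields leaves depth-first, regardless of nesting depth.
    print(list(iterate_nested([[[1, 2, 3], [4]], [5, 6]])))
    # [1, 2, 3, 4, 5, 6]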

xarray/tests/test_combine.py

Lines changed: 68 additions & 0 deletions
@@ -646,6 +646,47 @@ def test_combine_nested_fill_value(self, fill_value):
         actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
         assert_identical(expected, actual)
 
+    def test_combine_nested_unnamed_data_arrays(self):
+        unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+
+        actual = combine_nested([unnamed_array], concat_dim="x")
+        expected = unnamed_array
+        assert_identical(expected, actual)
+
+        unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_nested([unnamed_array1, unnamed_array2], concat_dim="x")
+        expected = DataArray(
+            data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
+        )
+        assert_identical(expected, actual)
+
+        da1 = DataArray(data=[[0.0]], coords={"x": [0], "y": [0]}, dims=["x", "y"])
+        da2 = DataArray(data=[[1.0]], coords={"x": [0], "y": [1]}, dims=["x", "y"])
+        da3 = DataArray(data=[[2.0]], coords={"x": [1], "y": [0]}, dims=["x", "y"])
+        da4 = DataArray(data=[[3.0]], coords={"x": [1], "y": [1]}, dims=["x", "y"])
+        objs = [[da1, da2], [da3, da4]]
+
+        expected = DataArray(
+            data=[[0.0, 1.0], [2.0, 3.0]],
+            coords={"x": [0, 1], "y": [0, 1]},
+            dims=["x", "y"],
+        )
+        actual = combine_nested(objs, concat_dim=["x", "y"])
+        assert_identical(expected, actual)
+
+    # TODO aijams - Determine if this test is appropriate.
+    def test_nested_combine_mixed_datasets_arrays(self):
+        objs = [
+            DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
+            Dataset({"x": [2, 3]}),
+        ]
+        with pytest.raises(
+            ValueError, match=r"Can't combine datasets with unnamed arrays."
+        ):
+            combine_nested(objs, "x")
+
 
 class TestCombineAuto:
     def test_combine_by_coords(self):
@@ -689,6 +730,17 @@ def test_combine_by_coords(self):
     def test_empty_input(self):
         assert_identical(Dataset(), combine_by_coords([]))
 
+    def test_combine_coords_mixed_datasets_arrays(self):
+        objs = [
+            DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
+            Dataset({"x": [2, 3]}),
+        ]
+        with pytest.raises(
+            ValueError,
+            match=r"Can't automatically combine datasets with unnamed arrays.",
+        ):
+            combine_by_coords(objs)
+
     @pytest.mark.parametrize(
         "join, expected",
         [
@@ -992,6 +1044,22 @@ def test_combine_by_coords_incomplete_hypercube(self):
         with pytest.raises(ValueError):
             combine_by_coords([x1, x2, x3], fill_value=None)
 
+    def test_combine_by_coords_unnamed_arrays(self):
+        unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+
+        actual = combine_by_coords([unnamed_array])
+        expected = unnamed_array
+        assert_identical(expected, actual)
+
+        unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_by_coords([unnamed_array1, unnamed_array2])
+        expected = DataArray(
+            data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
+        )
+        assert_identical(expected, actual)
+
 
 @requires_cftime
 def test_combine_by_coords_distant_cftime_dates():

xarray/tests/test_utils.py

Lines changed: 16 additions & 1 deletion
@@ -8,7 +8,7 @@
 from xarray.coding.cftimeindex import CFTimeIndex
 from xarray.core import duck_array_ops, utils
 from xarray.core.indexes import PandasIndex
-from xarray.core.utils import either_dict_or_kwargs
+from xarray.core.utils import either_dict_or_kwargs, iterate_nested
 
 from . import assert_array_equal, requires_cftime, requires_dask
 from .test_coding_times import _all_cftime_date_types
@@ -318,3 +318,18 @@ def test_infix_dims(supplied, all_, expected):
 def test_infix_dims_errors(supplied, all_):
     with pytest.raises(ValueError):
         list(utils.infix_dims(supplied, all_))
+
+
+@pytest.mark.parametrize(
+    "nested_list, expected",
+    [
+        ([], []),
+        ([1], [1]),
+        ([1, 2, 3], [1, 2, 3]),
+        ([[1]], [1]),
+        ([[1, 2], [3, 4]], [1, 2, 3, 4]),
+        ([[[1, 2, 3], [4]], [5, 6]], [1, 2, 3, 4, 5, 6]),
+    ],
+)
+def test_iterate_nested(nested_list, expected):
+    assert list(iterate_nested(nested_list)) == expected
