
Commit 3d1d134

combine_by_coordinates to handle unnamed data arrays. (#4696)
* Added test for combine_by_coords changes.
* Modified test case to expect a dataset instead of a DataArray. Added converter to combine_by_coords to check for all DataArray case and convert to datasets.
* Added tests to check combine_by_coords for exception with mixed DataArrays and dataset input and with empty list.
* Formatting changes after running black
* Added underscore to helper function to label as private.
* Black formatting changes for whats-new doc file.
* Removed imports in docstring that were automatically added by code styling tools to match the other docstrings.
* Removed duplicate new item line in whats-new.
* combine methods now accept unnamed DataArrays as input.
* combine nested test checks nested lists of unnamed DataArrays.
* Made combine_by_coords more readable.
* Cosmetic changes to code style.
* Removed extra test from merge with previous PR.
* Updated test to use pytest.raises instead of raises_regex.
* Added breaking-change entry to whats new page.
* Added deprecation warning to combine_coords
* Removed index monotonicity checking temporarily.
* Removed duplicate entries from whats new page.
* Removed TODO message
* Added test for combine_nested.
* Added check to combine methods to clarify parameter requirements.
* Reassigned description of changes to bug fixes category.
* Minor style changes.
* Added blank line for style purposes.
1 parent 2f8623d

File tree: 5 files changed, +230 −43 lines

* doc/whats-new.rst
* xarray/core/combine.py
* xarray/core/utils.py
* xarray/tests/test_combine.py
* xarray/tests/test_utils.py
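
With this change, a list of unnamed DataArray objects can be passed straight to combine_by_coords. A minimal sketch of the intended usage, based on the tests added in this commit:

    import xarray as xr

    # Two unnamed 1-D arrays covering adjacent slices of the "x" coordinate.
    a = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
    b = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

    # combine_by_coords now accepts the unnamed arrays and returns a single
    # unnamed DataArray spanning x = 0..3.
    combined = xr.combine_by_coords([a, b])
    print(combined)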

doc/whats-new.rst

Lines changed: 14 additions & 2 deletions
@@ -130,7 +130,6 @@ Thomas Nicholas, Tom Nicholas, Zachary Moon.
 
 New Features
 ~~~~~~~~~~~~
-
 - Implement :py:meth:`DataArray.drop_duplicates`
   to remove duplicate dimension values (:pull:`5239`).
   By `Andrew Huang <https://github.com/ahuang11>`_.
@@ -143,9 +142,22 @@ New Features
 - Raise more informative error when decoding time variables with invalid reference dates.
   (:issue:`5199`, :pull:`5288`). By `Giacomo Caria <https://github.com/gcaria>`_.
 
+Breaking changes
+~~~~~~~~~~~~~~~~
+- The main parameter to :py:func:`combine_by_coords` is renamed to `data_objects` instead
+  of `datasets` so anyone calling this method using a named parameter will need to update
+  the name accordingly (:issue:`3248`, :pull:`4696`).
+  By `Augustus Ijams <https://github.com/aijams>`_.
+
+Deprecations
+~~~~~~~~~~~~
+
+
 Bug fixes
 ~~~~~~~~~
-
+- :py:func:`combine_by_coords` can now handle combining a list of unnamed
+  ``DataArray`` as input (:issue:`3248`, :pull:`4696`).
+  By `Augustus Ijams <https://github.com/aijams>`_.
 - Opening netCDF files from a path that doesn't end in ``.nc`` without supplying
   an explicit ``engine`` works again (:issue:`5295`), fixing a bug introduced in
   0.18.0.
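
The old keyword keeps working for one release cycle: passing datasets= still runs but emits a rename warning. A sketch of the transition path, assuming caller code that used the old keyword name:

    import warnings

    import xarray as xr

    ds1 = xr.Dataset({"a": ("x", [1, 2])}, coords={"x": [0, 1]})
    ds2 = xr.Dataset({"a": ("x", [3, 4])}, coords={"x": [2, 3]})

    # Old spelling: still accepted at this commit, but warns that the
    # argument has been renamed to `data_objects`.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        combined = xr.combine_by_coords(datasets=[ds1, ds2])
        print(caught[0].message)

    # New spelling going forward.
    combined = xr.combine_by_coords(data_objects=[ds1, ds2])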

xarray/core/combine.py

Lines changed: 124 additions & 40 deletions
@@ -1,4 +1,5 @@
 import itertools
+import warnings
 from collections import Counter
 
 import pandas as pd
@@ -8,6 +9,7 @@
 from .dataarray import DataArray
 from .dataset import Dataset
 from .merge import merge
+from .utils import iterate_nested
 
 
 def _infer_concat_order_from_positions(datasets):
@@ -544,6 +546,15 @@ def combine_nested(
     concat
     merge
     """
+    mixed_datasets_and_arrays = any(
+        isinstance(obj, Dataset) for obj in iterate_nested(datasets)
+    ) and any(
+        isinstance(obj, DataArray) and obj.name is None
+        for obj in iterate_nested(datasets)
+    )
+    if mixed_datasets_and_arrays:
+        raise ValueError("Can't combine datasets with unnamed arrays.")
+
     if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
         concat_dim = [concat_dim]
 
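The new guard walks arbitrarily nested input lists via iterate_nested before any combining starts. For illustration, mirroring test_nested_combine_mixed_datasets_arrays below, mixing a Dataset with an unnamed DataArray now fails fast:

    import xarray as xr

    objs = [
        xr.DataArray([0, 1], dims="x", coords={"x": [0, 1]}),
        xr.Dataset({"x": [2, 3]}),
    ]
    try:
        xr.combine_nested(objs, concat_dim="x")
    except ValueError as err:
        print(err)  # Can't combine datasets with unnamed arrays.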
@@ -565,18 +576,79 @@ def vars_as_keys(ds):
     return tuple(sorted(ds))
 
 
-def combine_by_coords(
+def _combine_single_variable_hypercube(
     datasets,
+    fill_value=dtypes.NA,
+    data_vars="all",
+    coords="different",
+    compat="no_conflicts",
+    join="outer",
+    combine_attrs="no_conflicts",
+):
+    """
+    Attempt to combine a list of Datasets into a hypercube using their
+    coordinates.
+
+    All provided Datasets must belong to a single variable, ie. must be
+    assigned the same variable name. This precondition is not checked by this
+    function, so the caller is assumed to know what it's doing.
+
+    This function is NOT part of the public API.
+    """
+    if len(datasets) == 0:
+        raise ValueError(
+            "At least one Dataset is required to resolve variable names "
+            "for combined hypercube."
+        )
+
+    combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
+
+    if fill_value is None:
+        # check that datasets form complete hypercube
+        _check_shape_tile_ids(combined_ids)
+    else:
+        # check only that all datasets have same dimension depth for these
+        # vars
+        _check_dimension_depth_tile_ids(combined_ids)
+
+    # Concatenate along all of concat_dims one by one to create single ds
+    concatenated = _combine_nd(
+        combined_ids,
+        concat_dims=concat_dims,
+        data_vars=data_vars,
+        coords=coords,
+        compat=compat,
+        fill_value=fill_value,
+        join=join,
+        combine_attrs=combine_attrs,
+    )
+
+    # Check the overall coordinates are monotonically increasing
+    for dim in concat_dims:
+        indexes = concatenated.indexes.get(dim)
+        if not (indexes.is_monotonic_increasing or indexes.is_monotonic_decreasing):
+            raise ValueError(
+                "Resulting object does not have monotonic"
+                " global indexes along dimension {}".format(dim)
+            )
+
+    return concatenated
+
+
+# TODO remove empty list default param after version 0.19, see PR4696
+def combine_by_coords(
+    data_objects=[],
     compat="no_conflicts",
     data_vars="all",
     coords="different",
     fill_value=dtypes.NA,
     join="outer",
     combine_attrs="no_conflicts",
+    datasets=None,
 ):
     """
-    Attempt to auto-magically combine the given datasets into one by using
-    dimension coordinates.
+    Attempt to auto-magically combine the given datasets (or data arrays)
+    into one by using dimension coordinates.
 
     This method attempts to combine a group of datasets along any number of
     dimensions into a single entity by inspecting coords and metadata and using
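
The extracted helper keeps the original safety net: after concatenation it verifies that every concatenation dimension ends up with a monotonic global index. A sketch of the kind of input this rejects, reached here via the public entry point (assuming interleaved coordinate ranges survive order inference and reach the check):

    import xarray as xr

    # The x ranges interleave (0, 3 vs. 2, 5), so no ordering of the pieces
    # yields a monotonic global index along "x".
    a = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 3]}, dims="x")
    b = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 5]}, dims="x")
    try:
        xr.combine_by_coords([a, b])
    except ValueError as err:
        print(err)  # Resulting object does not have monotonic global indexes...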
@@ -600,8 +672,9 @@ def combine_by_coords(
 
     Parameters
     ----------
-    datasets : sequence of xarray.Dataset
-        Dataset objects to combine.
+    data_objects : sequence of xarray.Dataset or sequence of xarray.DataArray
+        Data objects to combine.
+
     compat : {"identical", "equals", "broadcast_equals", "no_conflicts", "override"}, optional
         String indicating how to compare variables of the same name for
         potential conflicts:
@@ -776,51 +849,62 @@ def combine_by_coords(
        precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176
     """
 
-    # Group by data vars
-    sorted_datasets = sorted(datasets, key=vars_as_keys)
-    grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
-
-    # Perform the multidimensional combine on each group of data variables
-    # before merging back together
-    concatenated_grouped_by_data_vars = []
-    for vars, datasets_with_same_vars in grouped_by_vars:
-        combined_ids, concat_dims = _infer_concat_order_from_coords(
-            list(datasets_with_same_vars)
+    # TODO remove after version 0.19, see PR4696
+    if datasets is not None:
+        warnings.warn(
+            "The datasets argument has been renamed to `data_objects`."
+            " In future passing a value for datasets will raise an error."
         )
+        data_objects = datasets
 
-        if fill_value is None:
-            # check that datasets form complete hypercube
-            _check_shape_tile_ids(combined_ids)
-        else:
-            # check only that all datasets have same dimension depth for these
-            # vars
-            _check_dimension_depth_tile_ids(combined_ids)
+    if not data_objects:
+        return Dataset()
 
-        # Concatenate along all of concat_dims one by one to create single ds
-        concatenated = _combine_nd(
-            combined_ids,
-            concat_dims=concat_dims,
+    mixed_arrays_and_datasets = any(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    ) and any(isinstance(data_object, Dataset) for data_object in data_objects)
+    if mixed_arrays_and_datasets:
+        raise ValueError("Can't automatically combine datasets with unnamed arrays.")
+
+    all_unnamed_data_arrays = all(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    )
+    if all_unnamed_data_arrays:
+        unnamed_arrays = data_objects
+        temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]
+
+        combined_temp_dataset = _combine_single_variable_hypercube(
+            temp_datasets,
+            fill_value=fill_value,
             data_vars=data_vars,
             coords=coords,
             compat=compat,
-            fill_value=fill_value,
             join=join,
             combine_attrs=combine_attrs,
         )
+        return DataArray()._from_temp_dataset(combined_temp_dataset)
 
-        # Check the overall coordinates are monotonically increasing
-        # TODO (benbovy - flexible indexes): only with pandas.Index?
-        for dim in concat_dims:
-            indexes = concatenated.xindexes.get(dim)
-            if not (
-                indexes.array.is_monotonic_increasing
-                or indexes.array.is_monotonic_decreasing
-            ):
-                raise ValueError(
-                    "Resulting object does not have monotonic"
-                    " global indexes along dimension {}".format(dim)
-                )
-        concatenated_grouped_by_data_vars.append(concatenated)
+    else:
+        # Group by data vars
+        sorted_datasets = sorted(data_objects, key=vars_as_keys)
+        grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
+
+        # Perform the multidimensional combine on each group of data variables
+        # before merging back together
+        concatenated_grouped_by_data_vars = []
+        for vars, datasets_with_same_vars in grouped_by_vars:
+            concatenated = _combine_single_variable_hypercube(
+                list(datasets_with_same_vars),
+                fill_value=fill_value,
+                data_vars=data_vars,
+                coords=coords,
+                compat=compat,
+                join=join,
+                combine_attrs=combine_attrs,
+            )
+            concatenated_grouped_by_data_vars.append(concatenated)
 
     return merge(
         concatenated_grouped_by_data_vars,
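
Taken together, combine_by_coords now dispatches across three paths: empty input short-circuits to an empty Dataset, all-unnamed-DataArray input round-trips through temporary datasets back to an unnamed DataArray, and Dataset input follows the original group-by-variables route. A sketch of all three, based on the tests in this commit:

    import xarray as xr

    # 1. Empty input returns an empty Dataset.
    assert xr.combine_by_coords([]).equals(xr.Dataset())

    # 2. All unnamed DataArrays: combined via temporary datasets and
    #    handed back as a single unnamed DataArray.
    a = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
    b = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
    arr = xr.combine_by_coords([a, b])
    assert isinstance(arr, xr.DataArray) and arr.name is None

    # 3. Datasets take the original grouped-merge path.
    ds1 = xr.Dataset({"v": ("x", [1, 2])}, coords={"x": [0, 1]})
    ds2 = xr.Dataset({"v": ("x", [3, 4])}, coords={"x": [2, 3]})
    assert isinstance(xr.combine_by_coords([ds1, ds2]), xr.Dataset)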

xarray/core/utils.py

Lines changed: 8 additions & 0 deletions
@@ -907,3 +907,11 @@ class Default(Enum):
 
 
 _default = Default.token
+
+
+def iterate_nested(nested_list):
+    for item in nested_list:
+        if isinstance(item, list):
+            yield from iterate_nested(item)
+        else:
+            yield item
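
iterate_nested is a small depth-first generator that flattens arbitrarily nested lists; combine_nested uses it to scan every leaf of its input for the mixed-types check. For example:

    from xarray.core.utils import iterate_nested

    # Yields leaves depth-first, regardless of nesting depth.
    print(list(iterate_nested([[[1, 2, 3], [4]], [5, 6]])))
    # [1, 2, 3, 4, 5, 6]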

xarray/tests/test_combine.py

Lines changed: 68 additions & 0 deletions
@@ -646,6 +646,47 @@ def test_combine_nested_fill_value(self, fill_value):
         actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
         assert_identical(expected, actual)
 
+    def test_combine_nested_unnamed_data_arrays(self):
+        unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+
+        actual = combine_nested([unnamed_array], concat_dim="x")
+        expected = unnamed_array
+        assert_identical(expected, actual)
+
+        unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_nested([unnamed_array1, unnamed_array2], concat_dim="x")
+        expected = DataArray(
+            data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
+        )
+        assert_identical(expected, actual)
+
+        da1 = DataArray(data=[[0.0]], coords={"x": [0], "y": [0]}, dims=["x", "y"])
+        da2 = DataArray(data=[[1.0]], coords={"x": [0], "y": [1]}, dims=["x", "y"])
+        da3 = DataArray(data=[[2.0]], coords={"x": [1], "y": [0]}, dims=["x", "y"])
+        da4 = DataArray(data=[[3.0]], coords={"x": [1], "y": [1]}, dims=["x", "y"])
+        objs = [[da1, da2], [da3, da4]]
+
+        expected = DataArray(
+            data=[[0.0, 1.0], [2.0, 3.0]],
+            coords={"x": [0, 1], "y": [0, 1]},
+            dims=["x", "y"],
+        )
+        actual = combine_nested(objs, concat_dim=["x", "y"])
+        assert_identical(expected, actual)
+
+    # TODO aijams - Determine if this test is appropriate.
+    def test_nested_combine_mixed_datasets_arrays(self):
+        objs = [
+            DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
+            Dataset({"x": [2, 3]}),
+        ]
+        with pytest.raises(
+            ValueError, match=r"Can't combine datasets with unnamed arrays."
+        ):
+            combine_nested(objs, "x")
+
 
 class TestCombineAuto:
     def test_combine_by_coords(self):
@@ -689,6 +730,17 @@ def test_combine_by_coords(self):
     def test_empty_input(self):
         assert_identical(Dataset(), combine_by_coords([]))
 
+    def test_combine_coords_mixed_datasets_arrays(self):
+        objs = [
+            DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
+            Dataset({"x": [2, 3]}),
+        ]
+        with pytest.raises(
+            ValueError,
+            match=r"Can't automatically combine datasets with unnamed arrays.",
+        ):
+            combine_by_coords(objs)
+
     @pytest.mark.parametrize(
         "join, expected",
         [
@@ -992,6 +1044,22 @@ def test_combine_by_coords_incomplete_hypercube(self):
         with pytest.raises(ValueError):
             combine_by_coords([x1, x2, x3], fill_value=None)
 
+    def test_combine_by_coords_unnamed_arrays(self):
+        unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+
+        actual = combine_by_coords([unnamed_array])
+        expected = unnamed_array
+        assert_identical(expected, actual)
+
+        unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_by_coords([unnamed_array1, unnamed_array2])
+        expected = DataArray(
+            data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
+        )
+        assert_identical(expected, actual)
+
 
 @requires_cftime
 def test_combine_by_coords_distant_cftime_dates():

xarray/tests/test_utils.py

Lines changed: 16 additions & 1 deletion
@@ -8,7 +8,7 @@
 from xarray.coding.cftimeindex import CFTimeIndex
 from xarray.core import duck_array_ops, utils
 from xarray.core.indexes import PandasIndex
-from xarray.core.utils import either_dict_or_kwargs
+from xarray.core.utils import either_dict_or_kwargs, iterate_nested
 
 from . import assert_array_equal, requires_cftime, requires_dask
 from .test_coding_times import _all_cftime_date_types
@@ -318,3 +318,18 @@ def test_infix_dims(supplied, all_, expected):
 def test_infix_dims_errors(supplied, all_):
     with pytest.raises(ValueError):
         list(utils.infix_dims(supplied, all_))
+
+
+@pytest.mark.parametrize(
+    "nested_list, expected",
+    [
+        ([], []),
+        ([1], [1]),
+        ([1, 2, 3], [1, 2, 3]),
+        ([[1]], [1]),
+        ([[1, 2], [3, 4]], [1, 2, 3, 4]),
+        ([[[1, 2, 3], [4]], [5, 6]], [1, 2, 3, 4, 5, 6]),
+    ],
+)
+def test_iterate_nested(nested_list, expected):
+    assert list(iterate_nested(nested_list)) == expected
