Skip to content

Commit 4a15cfa

Browse files
willirath authored and shoyer committed
Add pathlib.Path support to open_(mf)dataset (#1514)
* Add pathlib support * Loop over tmpfile functions * Added show_commit_url to asv.conf (#1515) * Added show_commit_url to asv.conf This should set up the proper links from the published output to the commit on Github. FYI the benchmarks should be running stably now, and posted to http://pandas.pydata.org/speed/xarray. http://pandas.pydata.org/speed/xarray/regressions.xml has an RSS feed to the regressions. * Update asv.conf.json * Small documentation fixes (#1516) * Clarify in docs that inferring DataArray dimensions is deprecated * Fix DataArray docstring * Clarify DataArray coords documentation * Condense pathlib handling for open_mf_dataset * Add and test pathlib support for backends * Add pathlib2 for python < 3 * Use pathlib backport if available. This follows <jazzband/pathlib2#8 (comment)> who argues for sticking to pathlib2. * Use pathlib w DataArray.to_netcdf * Handle case of completely missing pathlib * Remove pathlib requirement * Drop pathlib from minimal test env * Add what's-new entry on pathlib support * Prefer stdlib pathlib * Suppress ImportError's for pathlib * Actually get suppress function * Add decorator for tests requiring pathlib(2) * Move path_type to central submodule * Remove unnecessary parens * Revert "Added show_commit_url to asv.conf (#1515)" This reverts commit 02023ed. * Revert "Small documentation fixes (#1516)" This reverts commit 4276bb8. * Fix typo in docstring and fallback-module name * Tweak what's new for pathlib support
1 parent 4571d60 commit 4a15cfa

9 files changed

+138
-26
lines changed

ci/requirements-py27-cdat+pynio.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ dependencies:
1313
- netcdf4
1414
- numpy
1515
- pandas
16+
- pathlib2
1617
- pynio
1718
- pytest
1819
- scipy

ci/requirements-py27-windows.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dependencies:
99
- h5netcdf
1010
- matplotlib
1111
- netcdf4
12+
- pathlib2
1213
- pytest
1314
- numpy
1415
- pandas

doc/whats-new.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,31 @@ Backward Incompatible Changes
3535
Enhancements
3636
~~~~~~~~~~~~
3737

38+
- Support for `pathlib.Path` objects added to
39+
:py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
40+
:py:func:`~xarray.to_netcdf`, and :py:func:`~xarray.save_mfdataset`
41+
(:issue:`799`):
42+
43+
.. ipython::
44+
:verbatim:
45+
46+
In [2]: from pathlib import Path # In Python 2, use pathlib2!
47+
48+
In [3]: data_dir = Path("data/")
49+
50+
In [4]: one_file = data_dir / "dta_for_month_01.nc"
51+
52+
In [5]: xr.open_dataset(one_file)
53+
Out[5]:
54+
<xarray.Dataset>
55+
[...]
56+
57+
By `Willi Rath <https://github.com/willirath>`_.
58+
3859
- More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
3960
raster files are opened with :py:func:`~xarray.open_rasterio`.
4061
By `Greg Brener <https://github.com/gbrener>`_
62+
4163
- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that
4264
a signed integer data type should be interpreted as unsigned bytes
4365
(:issue:`1444`).

xarray/backends/api.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@
77
from io import BytesIO
88
from numbers import Number
99

10+
1011
import numpy as np
1112

1213
from .. import backends, conventions
1314
from .common import ArrayWriter, GLOBAL_LOCK
1415
from ..core import indexing
1516
from ..core.combine import auto_combine
1617
from ..core.utils import close_on_error, is_remote_uri
17-
from ..core.pycompat import basestring
18+
from ..core.pycompat import basestring, path_type
1819

1920
DATAARRAY_NAME = '__xarray_dataarray_name__'
2021
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'
@@ -139,12 +140,12 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
139140
140141
Parameters
141142
----------
142-
filename_or_obj : str, file or xarray.backends.*DataStore
143-
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
144-
and opened with python-netCDF4, unless the filename ends with .gz, in
145-
which case the file is gunzipped and opened with scipy.io.netcdf (only
146-
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
147-
(only netCDF3 supported).
143+
filename_or_obj : str, Path, file or xarray.backends.*DataStore
144+
Strings and Path objects are interpreted as a path to a netCDF file
145+
or an OpenDAP URL and opened with python-netCDF4, unless the filename
146+
ends with .gz, in which case the file is gunzipped and opened with
147+
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
148+
with scipy.io.netcdf (only netCDF3 supported).
148149
group : str, optional
149150
Path to the netCDF4 group in the given file to open (only works for
150151
netCDF4 files).
@@ -253,6 +254,9 @@ def maybe_decode_store(store, lock=False):
253254

254255
return ds2
255256

257+
if isinstance(filename_or_obj, path_type):
258+
filename_or_obj = str(filename_or_obj)
259+
256260
if isinstance(filename_or_obj, backends.AbstractDataStore):
257261
store = filename_or_obj
258262
elif isinstance(filename_or_obj, basestring):
@@ -318,12 +322,12 @@ def open_dataarray(*args, **kwargs):
318322
319323
Parameters
320324
----------
321-
filename_or_obj : str, file or xarray.backends.*DataStore
322-
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
323-
and opened with python-netCDF4, unless the filename ends with .gz, in
324-
which case the file is gunzipped and opened with scipy.io.netcdf (only
325-
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
326-
(only netCDF3 supported).
325+
filename_or_obj : str, Path, file or xarray.backends.*DataStore
326+
Strings and Paths are interpreted as a path to a netCDF file or an
327+
OpenDAP URL and opened with python-netCDF4, unless the filename ends
328+
with .gz, in which case the file is gunzipped and opened with
329+
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
330+
with scipy.io.netcdf (only netCDF3 supported).
327331
group : str, optional
328332
Path to the netCDF4 group in the given file to open (only works for
329333
netCDF4 files).
@@ -438,7 +442,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
438442
----------
439443
paths : str or sequence
440444
Either a string glob in the form "path/to/my/files/*.nc" or an explicit
441-
list of files to open.
445+
list of files to open. Paths can be given as strings or as pathlib
446+
Paths.
442447
chunks : int or dict, optional
443448
Dictionary with keys given by dimension names and values given by chunk
444449
sizes. In general, these should divide the dimensions of each dataset.
@@ -497,6 +502,9 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
497502
"""
498503
if isinstance(paths, basestring):
499504
paths = sorted(glob(paths))
505+
else:
506+
paths = [str(p) if isinstance(p, path_type) else p for p in paths]
507+
500508
if not paths:
501509
raise IOError('no files to open')
502510

@@ -533,6 +541,8 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,
533541
534542
The ``writer`` argument is only for the private use of save_mfdataset.
535543
"""
544+
if isinstance(path_or_file, path_type):
545+
path_or_file = str(path_or_file)
536546
if encoding is None:
537547
encoding = {}
538548
if path_or_file is None:
@@ -597,12 +607,14 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
597607
----------
598608
datasets : list of xarray.Dataset
599609
List of datasets to save.
600-
paths : list of str
610+
paths : list of str or list of Paths
601611
List of paths to which to save each corresponding dataset.
602612
mode : {'w', 'a'}, optional
603613
Write ('w') or append ('a') mode. If mode='w', any existing file at
604614
these locations will be overwritten.
605-
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
615+
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
616+
'NETCDF3_CLASSIC'}, optional
617+
606618
File format for the resulting netCDF file:
607619
608620
* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API

xarray/core/dataarray.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,15 +1286,16 @@ def to_netcdf(self, *args, **kwargs):
12861286
12871287
Parameters
12881288
----------
1289-
path : str, optional
1289+
path : str or Path, optional
12901290
Path to which to save this dataset. If no path is provided, this
12911291
function returns the resulting netCDF file as a bytes object; in
12921292
this case, we need to use scipy.io.netcdf, which does not support
12931293
netCDF version 4 (the default format becomes NETCDF3_64BIT).
12941294
mode : {'w', 'a'}, optional
12951295
Write ('w') or append ('a') mode. If mode='w', any existing file at
12961296
this location will be overwritten.
1297-
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
1297+
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
1298+
'NETCDF3_CLASSIC'}, optional
12981299
File format for the resulting netCDF file:
12991300
13001301
* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
@@ -1324,7 +1325,8 @@ def to_netcdf(self, *args, **kwargs):
13241325
encoding : dict, optional
13251326
Nested dictionary with variable names as keys and dictionaries of
13261327
variable specific encodings as values, e.g.,
1327-
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
1328+
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
1329+
'zlib': True}, ...}``
13281330
13291331
Notes
13301332
-----

xarray/core/dataset.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -924,7 +924,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
924924
925925
Parameters
926926
----------
927-
path : str or file-like object, optional
927+
path : str, Path or file-like object, optional
928928
Path to which to save this dataset. File-like objects are only
929929
supported by the scipy engine. If no path is provided, this
930930
function returns the resulting netCDF file as bytes; in this case,
@@ -963,7 +963,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
963963
encoding : dict, optional
964964
Nested dictionary with variable names as keys and dictionaries of
965965
variable specific encodings as values, e.g.,
966-
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
966+
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
967+
'zlib': True}, ...}``
967968
unlimited_dims : sequence of str, optional
968969
Dimension(s) that should be serialized as unlimited dimensions.
969970
By default, no dimensions are treated as unlimited dimensions.

xarray/core/pycompat.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,16 @@ def itervalues(d):
5959
except ImportError: # pragma: no cover
6060
dask_array_type = ()
6161

62+
try:
63+
try:
64+
from pathlib import Path
65+
except ImportError as e:
66+
from pathlib2 import Path
67+
path_type = (Path, )
68+
except ImportError as e:
69+
path_type = ()
70+
71+
6272
try:
6373
from contextlib import suppress
6474
except ImportError:
@@ -188,7 +198,7 @@ def __exit__(self, *exc_details):
188198
# We manipulate the exception state so it behaves as though
189199
# we were actually nesting multiple with statements
190200
frame_exc = sys.exc_info()[1]
191-
201+
192202
def _fix_exception_context(new_exc, old_exc):
193203
# Context may not be correct, so find the end of the chain
194204
while 1:

xarray/tests/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,17 @@
8383
except ImportError:
8484
has_rasterio = False
8585

86+
try:
87+
import pathlib
88+
has_pathlib = True
89+
except ImportError:
90+
try:
91+
import pathlib2
92+
has_pathlib = True
93+
except ImportError:
94+
has_pathlib = False
95+
96+
8697
# slighly simpler construction that the full functions.
8798
# Generally `pytest.importorskip('package')` inline is even easier
8899
requires_matplotlib = pytest.mark.skipif(
@@ -105,6 +116,9 @@
105116
not has_bottleneck, reason='requires bottleneck')
106117
requires_rasterio = pytest.mark.skipif(
107118
not has_rasterio, reason='requires rasterio')
119+
requires_pathlib = pytest.mark.skipif(
120+
not has_pathlib, reason='requires pathlib / pathlib2'
121+
)
108122

109123

110124
try:

xarray/tests/test_backends.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@
2626

2727
from . import (TestCase, requires_scipy, requires_netCDF4, requires_pydap,
2828
requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf,
29-
requires_pynio, has_netCDF4, has_scipy, assert_allclose,
30-
flaky, network, requires_rasterio, assert_identical)
29+
requires_pynio, requires_pathlib, has_netCDF4, has_scipy,
30+
assert_allclose, flaky, network, requires_rasterio,
31+
assert_identical)
3132
from .test_dataset import create_test_data
3233

3334
try:
@@ -40,6 +41,14 @@
4041
except ImportError:
4142
pass
4243

44+
try:
45+
from pathlib import Path
46+
except ImportError:
47+
try:
48+
from pathlib2 import Path
49+
except ImportError:
50+
pass
51+
4352

4453
ON_WINDOWS = sys.platform == 'win32'
4554

@@ -302,7 +311,8 @@ def test_roundtrip_timedelta_data(self):
302311
self.assertDatasetIdentical(expected, actual)
303312

304313
def test_roundtrip_float64_data(self):
305-
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi], dtype='float64'))})
314+
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi],
315+
dtype='float64'))})
306316
with self.roundtrip(expected) as actual:
307317
self.assertDatasetIdentical(expected, actual)
308318

@@ -738,7 +748,8 @@ def test_mask_and_scale(self):
738748
v.scale_factor = 0.1
739749
v[:] = np.array([-1, -1, 0, 1, 2])
740750

741-
# first make sure netCDF4 reads the masked and scaled data correctly
751+
# first make sure netCDF4 reads the masked and scaled data
752+
# correctly
742753
with nc4.Dataset(tmp_file, mode='r') as nc:
743754
expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
744755
mask=[True, True, False, False, False])
@@ -1305,6 +1316,19 @@ def test_open_mfdataset(self):
13051316
with self.assertRaisesRegexp(IOError, 'no files to open'):
13061317
open_mfdataset('foo-bar-baz-*.nc', autoclose=self.autoclose)
13071318

1319+
@requires_pathlib
1320+
def test_open_mfdataset_pathlib(self):
1321+
original = Dataset({'foo': ('x', np.random.randn(10))})
1322+
with create_tmp_file() as tmp1:
1323+
with create_tmp_file() as tmp2:
1324+
tmp1 = Path(tmp1)
1325+
tmp2 = Path(tmp2)
1326+
original.isel(x=slice(5)).to_netcdf(tmp1)
1327+
original.isel(x=slice(5, 10)).to_netcdf(tmp2)
1328+
with open_mfdataset([tmp1, tmp2],
1329+
autoclose=self.autoclose) as actual:
1330+
self.assertDatasetAllClose(original, actual)
1331+
13081332
def test_attrs_mfdataset(self):
13091333
original = Dataset({'foo': ('x', np.random.randn(10))})
13101334
with create_tmp_file() as tmp1:
@@ -1355,6 +1379,20 @@ def test_save_mfdataset_invalid(self):
13551379
with self.assertRaisesRegexp(ValueError, 'same length'):
13561380
save_mfdataset([ds, ds], ['only one path'])
13571381

1382+
@requires_pathlib
1383+
def test_save_mfdataset_pathlib_roundtrip(self):
1384+
original = Dataset({'foo': ('x', np.random.randn(10))})
1385+
datasets = [original.isel(x=slice(5)),
1386+
original.isel(x=slice(5, 10))]
1387+
with create_tmp_file() as tmp1:
1388+
with create_tmp_file() as tmp2:
1389+
tmp1 = Path(tmp1)
1390+
tmp2 = Path(tmp2)
1391+
save_mfdataset(datasets, [tmp1, tmp2])
1392+
with open_mfdataset([tmp1, tmp2],
1393+
autoclose=self.autoclose) as actual:
1394+
self.assertDatasetIdentical(actual, original)
1395+
13581396
def test_open_and_do_math(self):
13591397
original = Dataset({'foo': ('x', np.random.randn(10))})
13601398
with create_tmp_file() as tmp:
@@ -1946,3 +1984,14 @@ def test_open_dataarray_options(self):
19461984
expected = data.drop('y')
19471985
with open_dataarray(tmp, drop_variables=['y']) as loaded:
19481986
self.assertDataArrayIdentical(expected, loaded)
1987+
1988+
@requires_pathlib
1989+
def test_dataarray_to_netcdf_no_name_pathlib(self):
1990+
original_da = DataArray(np.arange(12).reshape((3, 4)))
1991+
1992+
with create_tmp_file() as tmp:
1993+
tmp = Path(tmp)
1994+
original_da.to_netcdf(tmp)
1995+
1996+
with open_dataarray(tmp) as loaded_da:
1997+
self.assertDataArrayIdentical(original_da, loaded_da)

0 commit comments

Comments (0)