Skip to content

Commit 4a15cfa

Browse files
willirath authored and shoyer committed
Add pathlib.Path support to open_(mf)dataset (#1514)
* Add pathlib support * Loop over tmpfile functions * Added show_commit_url to asv.conf (#1515) * Added show_commit_url to asv.conf This should set up the proper links from the published output to the commit on Github. FYI the benchmarks should be running stably now, and posted to http://pandas.pydata.org/speed/xarray. http://pandas.pydata.org/speed/xarray/regressions.xml has an RSS feed to the regressions. * Update asv.conf.json * Small documentation fixes (#1516) * Clarify in docs that inferring DataArray dimensions is deprecated * Fix DataArray docstring * Clarify DataArray coords documentation * Condense pathlib handling for open_mf_dataset * Add and test pathlib support for backends * Add pathlib2 for python < 3 * Use pathlib backport if available. This follows <jazzband/pathlib2#8 (comment)> who argues for sticking to pathlib2. * Use pathlib w DataArray.to_netcdf * Handle case of completely missing pathlib * Remove pathlib requirement * Drop pathlib from minimal test env * Add what's-new entry on pathlib support * Prefer stdlib pathlib * Suppress ImportError's for pathlib * Actually get suppress function * Add decorator for tests requiring pathlib(2) * Move path_type to central submodule * Remove unnecessary parens * Revert "Added show_commit_url to asv.conf (#1515)" This reverts commit 02023ed. * Revert "Small documentation fixes (#1516)" This reverts commit 4276bb8. * Fix typo in docstring and fallback-module name * Tweak what's new for pathlib support
1 parent 4571d60 commit 4a15cfa

9 files changed

+138
-26
lines changed

ci/requirements-py27-cdat+pynio.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ dependencies:
1313
- netcdf4
1414
- numpy
1515
- pandas
16+
- pathlib2
1617
- pynio
1718
- pytest
1819
- scipy

ci/requirements-py27-windows.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dependencies:
99
- h5netcdf
1010
- matplotlib
1111
- netcdf4
12+
- pathlib2
1213
- pytest
1314
- numpy
1415
- pandas

doc/whats-new.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,31 @@ Backward Incompatible Changes
3535
Enhancements
3636
~~~~~~~~~~~~
3737

38+
- Support for `pathlib.Path` objects added to
39+
:py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
40+
:py:func:`~xarray.to_netcdf`, and :py:func:`~xarray.save_mfdataset`
41+
(:issue:`799`):
42+
43+
.. ipython::
44+
:verbatim:
45+
46+
In [2]: from pathlib import Path # In Python 2, use pathlib2!
47+
48+
In [3]: data_dir = Path("data/")
49+
50+
In [4]: one_file = data_dir / "dta_for_month_01.nc"
51+
52+
In [5]: xr.open_dataset(one_file)
53+
Out[5]:
54+
<xarray.Dataset>
55+
[...]
56+
57+
By `Willi Rath <https://github.com/willirath>`_.
58+
3859
- More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
3960
raster files are opened with :py:func:`~xarray.open_rasterio`.
4061
By `Greg Brener <https://github.com/gbrener>`_
62+
4163
- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that
4264
a signed integer data type should be interpreted as unsigned bytes
4365
(:issue:`1444`).

xarray/backends/api.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@
77
from io import BytesIO
88
from numbers import Number
99

10+
1011
import numpy as np
1112

1213
from .. import backends, conventions
1314
from .common import ArrayWriter, GLOBAL_LOCK
1415
from ..core import indexing
1516
from ..core.combine import auto_combine
1617
from ..core.utils import close_on_error, is_remote_uri
17-
from ..core.pycompat import basestring
18+
from ..core.pycompat import basestring, path_type
1819

1920
DATAARRAY_NAME = '__xarray_dataarray_name__'
2021
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'
@@ -139,12 +140,12 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
139140
140141
Parameters
141142
----------
142-
filename_or_obj : str, file or xarray.backends.*DataStore
143-
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
144-
and opened with python-netCDF4, unless the filename ends with .gz, in
145-
which case the file is gunzipped and opened with scipy.io.netcdf (only
146-
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
147-
(only netCDF3 supported).
143+
filename_or_obj : str, Path, file or xarray.backends.*DataStore
144+
Strings and Path objects are interpreted as a path to a netCDF file
145+
or an OpenDAP URL and opened with python-netCDF4, unless the filename
146+
ends with .gz, in which case the file is gunzipped and opened with
147+
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
148+
with scipy.io.netcdf (only netCDF3 supported).
148149
group : str, optional
149150
Path to the netCDF4 group in the given file to open (only works for
150151
netCDF4 files).
@@ -253,6 +254,9 @@ def maybe_decode_store(store, lock=False):
253254

254255
return ds2
255256

257+
if isinstance(filename_or_obj, path_type):
258+
filename_or_obj = str(filename_or_obj)
259+
256260
if isinstance(filename_or_obj, backends.AbstractDataStore):
257261
store = filename_or_obj
258262
elif isinstance(filename_or_obj, basestring):
@@ -318,12 +322,12 @@ def open_dataarray(*args, **kwargs):
318322
319323
Parameters
320324
----------
321-
filename_or_obj : str, file or xarray.backends.*DataStore
322-
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
323-
and opened with python-netCDF4, unless the filename ends with .gz, in
324-
which case the file is gunzipped and opened with scipy.io.netcdf (only
325-
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
326-
(only netCDF3 supported).
325+
filename_or_obj : str, Path, file or xarray.backends.*DataStore
326+
Strings and Paths are interpreted as a path to a netCDF file or an
327+
OpenDAP URL and opened with python-netCDF4, unless the filename ends
328+
with .gz, in which case the file is gunzipped and opened with
329+
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
330+
with scipy.io.netcdf (only netCDF3 supported).
327331
group : str, optional
328332
Path to the netCDF4 group in the given file to open (only works for
329333
netCDF4 files).
@@ -438,7 +442,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
438442
----------
439443
paths : str or sequence
440444
Either a string glob in the form "path/to/my/files/*.nc" or an explicit
441-
list of files to open.
445+
list of files to open. Paths can be given as strings or as pathlib
446+
Paths.
442447
chunks : int or dict, optional
443448
Dictionary with keys given by dimension names and values given by chunk
444449
sizes. In general, these should divide the dimensions of each dataset.
@@ -497,6 +502,9 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
497502
"""
498503
if isinstance(paths, basestring):
499504
paths = sorted(glob(paths))
505+
else:
506+
paths = [str(p) if isinstance(p, path_type) else p for p in paths]
507+
500508
if not paths:
501509
raise IOError('no files to open')
502510

@@ -533,6 +541,8 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,
533541
534542
The ``writer`` argument is only for the private use of save_mfdataset.
535543
"""
544+
if isinstance(path_or_file, path_type):
545+
path_or_file = str(path_or_file)
536546
if encoding is None:
537547
encoding = {}
538548
if path_or_file is None:
@@ -597,12 +607,14 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
597607
----------
598608
datasets : list of xarray.Dataset
599609
List of datasets to save.
600-
paths : list of str
610+
paths : list of str or list of Paths
601611
List of paths to which to save each corresponding dataset.
602612
mode : {'w', 'a'}, optional
603613
Write ('w') or append ('a') mode. If mode='w', any existing file at
604614
these locations will be overwritten.
605-
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
615+
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
616+
'NETCDF3_CLASSIC'}, optional
617+
606618
File format for the resulting netCDF file:
607619
608620
* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API

xarray/core/dataarray.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,15 +1286,16 @@ def to_netcdf(self, *args, **kwargs):
12861286
12871287
Parameters
12881288
----------
1289-
path : str, optional
1289+
path : str or Path, optional
12901290
Path to which to save this dataset. If no path is provided, this
12911291
function returns the resulting netCDF file as a bytes object; in
12921292
this case, we need to use scipy.io.netcdf, which does not support
12931293
netCDF version 4 (the default format becomes NETCDF3_64BIT).
12941294
mode : {'w', 'a'}, optional
12951295
Write ('w') or append ('a') mode. If mode='w', any existing file at
12961296
this location will be overwritten.
1297-
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
1297+
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
1298+
'NETCDF3_CLASSIC'}, optional
12981299
File format for the resulting netCDF file:
12991300
13001301
* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
@@ -1324,7 +1325,8 @@ def to_netcdf(self, *args, **kwargs):
13241325
encoding : dict, optional
13251326
Nested dictionary with variable names as keys and dictionaries of
13261327
variable specific encodings as values, e.g.,
1327-
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
1328+
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
1329+
'zlib': True}, ...}``
13281330
13291331
Notes
13301332
-----

xarray/core/dataset.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -924,7 +924,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
924924
925925
Parameters
926926
----------
927-
path : str or file-like object, optional
927+
path : str, Path or file-like object, optional
928928
Path to which to save this dataset. File-like objects are only
929929
supported by the scipy engine. If no path is provided, this
930930
function returns the resulting netCDF file as bytes; in this case,
@@ -963,7 +963,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
963963
encoding : dict, optional
964964
Nested dictionary with variable names as keys and dictionaries of
965965
variable specific encodings as values, e.g.,
966-
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
966+
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
967+
'zlib': True}, ...}``
967968
unlimited_dims : sequence of str, optional
968969
Dimension(s) that should be serialized as unlimited dimensions.
969970
By default, no dimensions are treated as unlimited dimensions.

xarray/core/pycompat.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,16 @@ def itervalues(d):
5959
except ImportError: # pragma: no cover
6060
dask_array_type = ()
6161

62+
try:
63+
try:
64+
from pathlib import Path
65+
except ImportError as e:
66+
from pathlib2 import Path
67+
path_type = (Path, )
68+
except ImportError as e:
69+
path_type = ()
70+
71+
6272
try:
6373
from contextlib import suppress
6474
except ImportError:
@@ -188,7 +198,7 @@ def __exit__(self, *exc_details):
188198
# We manipulate the exception state so it behaves as though
189199
# we were actually nesting multiple with statements
190200
frame_exc = sys.exc_info()[1]
191-
201+
192202
def _fix_exception_context(new_exc, old_exc):
193203
# Context may not be correct, so find the end of the chain
194204
while 1:

xarray/tests/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,17 @@
8383
except ImportError:
8484
has_rasterio = False
8585

86+
try:
87+
import pathlib
88+
has_pathlib = True
89+
except ImportError:
90+
try:
91+
import pathlib2
92+
has_pathlib = True
93+
except ImportError:
94+
has_pathlib = False
95+
96+
8697
# slighly simpler construction that the full functions.
8798
# Generally `pytest.importorskip('package')` inline is even easier
8899
requires_matplotlib = pytest.mark.skipif(
@@ -105,6 +116,9 @@
105116
not has_bottleneck, reason='requires bottleneck')
106117
requires_rasterio = pytest.mark.skipif(
107118
not has_rasterio, reason='requires rasterio')
119+
requires_pathlib = pytest.mark.skipif(
120+
not has_pathlib, reason='requires pathlib / pathlib2'
121+
)
108122

109123

110124
try:

xarray/tests/test_backends.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@
2626

2727
from . import (TestCase, requires_scipy, requires_netCDF4, requires_pydap,
2828
requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf,
29-
requires_pynio, has_netCDF4, has_scipy, assert_allclose,
30-
flaky, network, requires_rasterio, assert_identical)
29+
requires_pynio, requires_pathlib, has_netCDF4, has_scipy,
30+
assert_allclose, flaky, network, requires_rasterio,
31+
assert_identical)
3132
from .test_dataset import create_test_data
3233

3334
try:
@@ -40,6 +41,14 @@
4041
except ImportError:
4142
pass
4243

44+
try:
45+
from pathlib import Path
46+
except ImportError:
47+
try:
48+
from pathlib2 import Path
49+
except ImportError:
50+
pass
51+
4352

4453
ON_WINDOWS = sys.platform == 'win32'
4554

@@ -302,7 +311,8 @@ def test_roundtrip_timedelta_data(self):
302311
self.assertDatasetIdentical(expected, actual)
303312

304313
def test_roundtrip_float64_data(self):
305-
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi], dtype='float64'))})
314+
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi],
315+
dtype='float64'))})
306316
with self.roundtrip(expected) as actual:
307317
self.assertDatasetIdentical(expected, actual)
308318

@@ -738,7 +748,8 @@ def test_mask_and_scale(self):
738748
v.scale_factor = 0.1
739749
v[:] = np.array([-1, -1, 0, 1, 2])
740750

741-
# first make sure netCDF4 reads the masked and scaled data correctly
751+
# first make sure netCDF4 reads the masked and scaled data
752+
# correctly
742753
with nc4.Dataset(tmp_file, mode='r') as nc:
743754
expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
744755
mask=[True, True, False, False, False])
@@ -1305,6 +1316,19 @@ def test_open_mfdataset(self):
13051316
with self.assertRaisesRegexp(IOError, 'no files to open'):
13061317
open_mfdataset('foo-bar-baz-*.nc', autoclose=self.autoclose)
13071318

1319+
@requires_pathlib
1320+
def test_open_mfdataset_pathlib(self):
1321+
original = Dataset({'foo': ('x', np.random.randn(10))})
1322+
with create_tmp_file() as tmp1:
1323+
with create_tmp_file() as tmp2:
1324+
tmp1 = Path(tmp1)
1325+
tmp2 = Path(tmp2)
1326+
original.isel(x=slice(5)).to_netcdf(tmp1)
1327+
original.isel(x=slice(5, 10)).to_netcdf(tmp2)
1328+
with open_mfdataset([tmp1, tmp2],
1329+
autoclose=self.autoclose) as actual:
1330+
self.assertDatasetAllClose(original, actual)
1331+
13081332
def test_attrs_mfdataset(self):
13091333
original = Dataset({'foo': ('x', np.random.randn(10))})
13101334
with create_tmp_file() as tmp1:
@@ -1355,6 +1379,20 @@ def test_save_mfdataset_invalid(self):
13551379
with self.assertRaisesRegexp(ValueError, 'same length'):
13561380
save_mfdataset([ds, ds], ['only one path'])
13571381

1382+
@requires_pathlib
1383+
def test_save_mfdataset_pathlib_roundtrip(self):
1384+
original = Dataset({'foo': ('x', np.random.randn(10))})
1385+
datasets = [original.isel(x=slice(5)),
1386+
original.isel(x=slice(5, 10))]
1387+
with create_tmp_file() as tmp1:
1388+
with create_tmp_file() as tmp2:
1389+
tmp1 = Path(tmp1)
1390+
tmp2 = Path(tmp2)
1391+
save_mfdataset(datasets, [tmp1, tmp2])
1392+
with open_mfdataset([tmp1, tmp2],
1393+
autoclose=self.autoclose) as actual:
1394+
self.assertDatasetIdentical(actual, original)
1395+
13581396
def test_open_and_do_math(self):
13591397
original = Dataset({'foo': ('x', np.random.randn(10))})
13601398
with create_tmp_file() as tmp:
@@ -1946,3 +1984,14 @@ def test_open_dataarray_options(self):
19461984
expected = data.drop('y')
19471985
with open_dataarray(tmp, drop_variables=['y']) as loaded:
19481986
self.assertDataArrayIdentical(expected, loaded)
1987+
1988+
@requires_pathlib
1989+
def test_dataarray_to_netcdf_no_name_pathlib(self):
1990+
original_da = DataArray(np.arange(12).reshape((3, 4)))
1991+
1992+
with create_tmp_file() as tmp:
1993+
tmp = Path(tmp)
1994+
original_da.to_netcdf(tmp)
1995+
1996+
with open_dataarray(tmp) as loaded_da:
1997+
self.assertDataArrayIdentical(original_da, loaded_da)

0 commit comments

Comments (0)