Commit 8ef194f
Author: Joe Hamman

WIP: Compute==False for to_zarr and to_netcdf (#1811)
* move backend append logic to the prepare_variable methods
* deprecate variables/dimensions/attrs properties on AbstractWritableDataStore
* warnings instead of errors for backend properties
* use attrs.update when setting zarr attributes
* more performance improvements to attributes in zarr backend
* fix typo
* new set_dimensions method for writable data stores
* more fixes for zarr
* more tests for zarr and remove append logic for zarr
* a few more tweaks to zarr attrs
* add encode methods to writable data stores, fixes for Zarr tests
* fix for InMemoryDataStore
* fix for unlimited dimensions in Scipy DataStores
* another patch for scipy
* whatsnew
* initial commit returning dask futures from to_netcdf and to_zarr methods
* ordereddict
* address some of rabernat's comments; in particular, this commit removes the _DIMENSION_KEY from the zarr_group.attrs
* stop skipping zero-dim zarr tests
* update minimum zarr version for tests
* clean up a bit before adding tests
* temporary check-in
* clean up implementation of compute=False for the to_foo functions; still needs additional tests
* docs and more tests; failing tests on the h5netcdf backend only
* skip h5netcdf/netcdf4 tests in certain places
* remove spurious returns
* finalize stores when compute=False
* more docs; skip h5netcdf/netcdf tests; raise informative error for h5netcdf and scipy
* clean up whats-new
* reorganize dask task graph when using compute=False and save_mfdataset
* move compute=False tests to the DaskTests class
* small doc/style fixes
* save api.py
Parent: 9f58d50

File tree

10 files changed: +147 additions, -39 deletions

doc/dask.rst

Lines changed: 15 additions & 0 deletions
@@ -100,6 +100,21 @@ Once you've manipulated a dask array, you can still write a dataset too big to
 fit into memory back to disk by using :py:meth:`~xarray.Dataset.to_netcdf` in the
 usual way.
 
+.. ipython:: python
+
+    ds.to_netcdf('manipulated-example-data.nc')
+
+By setting the ``compute`` argument to ``False``, :py:meth:`~xarray.Dataset.to_netcdf`
+will return a dask delayed object that can be computed later.
+
+.. ipython:: python
+
+    from dask.diagnostics import ProgressBar
+    # or distributed.progress when using the distributed scheduler
+    delayed_obj = ds.to_netcdf('manipulated-example-data.nc', compute=False)
+    with ProgressBar():
+        results = delayed_obj.compute()
+
 .. note::
 
     When using dask's distributed scheduler to write NETCDF4 files,
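The snippet above uses ``ProgressBar`` from dask's local diagnostics; the inline comment points at ``distributed.progress`` for the distributed scheduler. A hedged sketch of that variant (the ``Client`` setup and file name are illustrative, not part of this commit, and per the note above, writing NETCDF4 files under the distributed scheduler comes with caveats):

# Hypothetical sketch: monitoring a deferred netCDF write on dask.distributed.
# Assumes ``ds`` is a dask-backed xarray.Dataset, as in the docs above.
from dask.distributed import Client, progress

client = Client()  # local cluster; real deployments pass a scheduler address
delayed_obj = ds.to_netcdf('manipulated-example-data.nc', compute=False)
future = client.compute(delayed_obj)  # start the write asynchronously
progress(future)                      # live progress bar for the write
future.result()                       # block until the file is complete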

doc/whats-new.rst

Lines changed: 13 additions & 6 deletions
@@ -69,6 +69,19 @@ Enhancements
 - ``plot.line()`` learned new kwargs: ``xincrease``, ``yincrease`` that change the direction of the respective axes.
   By `Deepak Cherian <https://github.com/dcherian>`_.
 
+- Added the ``parallel`` option to :py:func:`open_mfdataset`. This option uses
+  ``dask.delayed`` to parallelize the open and preprocessing steps within
+  ``open_mfdataset``. This is expected to provide performance improvements when
+  opening many files, particularly when used in conjunction with dask's
+  multiprocessing or distributed schedulers (:issue:`1981`).
+  By `Joe Hamman <https://github.com/jhamman>`_.
+
+- New ``compute`` option in :py:meth:`~xarray.Dataset.to_netcdf`,
+  :py:meth:`~xarray.Dataset.to_zarr`, and :py:func:`~xarray.save_mfdataset` to
+  allow for the lazy computation of netCDF and zarr stores. This feature is
+  currently only supported by the netCDF4 and zarr backends (:issue:`1784`).
+  By `Joe Hamman <https://github.com/jhamman>`_.
+
 Bug fixes
 ~~~~~~~~~
 
@@ -104,12 +117,6 @@ The minor release includes a number of bug-fixes and backwards compatible enhancements
 Enhancements
 ~~~~~~~~~~~~
 
-- Added the ``parallel`` option to :py:func:`open_mfdataset`. This option uses
-  ``dask.delayed`` to parallelize the open and preprocessing steps within
-  ``open_mfdataset``. This is expected to provide performance improvements when
-  opening many files, particularly when used in conjunction with dask's
-  multiprocessing or distributed schedulers (:issue:`1981`).
-  By `Joe Hamman <https://github.com/jhamman>`_.
 - :py:meth:`~xarray.DataArray.isin` and :py:meth:`~xarray.Dataset.isin` methods,
   which test each value in the array for whether it is contained in the
   supplied list, returning a bool array. See :ref:`selecting values with isin`

xarray/backends/api.py

Lines changed: 33 additions & 9 deletions
@@ -144,6 +144,13 @@ def _get_lock(engine, scheduler, format, path_or_file):
     return lock
 
 
+def _finalize_store(write, store):
+    """ Finalize this store by explicitly syncing and closing"""
+    del write  # ensure writing is done first
+    store.sync()
+    store.close()
+
+
 def open_dataset(filename_or_obj, group=None, decode_cf=True,
                  mask_and_scale=True, decode_times=True, autoclose=False,
                  concat_characters=True, decode_coords=True, engine=None,
@@ -620,7 +627,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
 
 
 def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,
-              engine=None, writer=None, encoding=None, unlimited_dims=None):
+              engine=None, writer=None, encoding=None, unlimited_dims=None,
+              compute=True):
     """This function creates an appropriate datastore for writing a dataset to
     disk as a netCDF file
 
@@ -680,19 +688,22 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,
         unlimited_dims = dataset.encoding.get('unlimited_dims', None)
     try:
         dataset.dump_to_store(store, sync=sync, encoding=encoding,
-                              unlimited_dims=unlimited_dims)
+                              unlimited_dims=unlimited_dims, compute=compute)
         if path_or_file is None:
             return target.getvalue()
     finally:
         if sync and isinstance(path_or_file, basestring):
             store.close()
 
+    if not compute:
+        import dask
+        return dask.delayed(_finalize_store)(store.delayed_store, store)
+
     if not sync:
         return store
 
-
 def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
-                   engine=None):
+                   engine=None, compute=True):
     """Write multiple datasets to disk as netCDF files simultaneously.
 
     This function is intended for use with datasets consisting of dask.array
@@ -742,6 +753,9 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
         default engine is chosen based on available dependencies, with a
         preference for 'netcdf4' if writing to a file on disk.
         See `Dataset.to_netcdf` for additional information.
+    compute: boolean
+        If True, compute immediately, otherwise return a
+        ``dask.delayed.Delayed`` object that can be computed later.
 
     Examples
     --------
@@ -769,11 +783,17 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
                          'datasets, paths and groups arguments to '
                          'save_mfdataset')
 
-    writer = ArrayWriter()
-    stores = [to_netcdf(ds, path, mode, format, group, engine, writer)
+    writer = ArrayWriter() if compute else None
+    stores = [to_netcdf(ds, path, mode, format, group, engine, writer,
+                        compute=compute)
               for ds, path, group in zip(datasets, paths, groups)]
+
+    if not compute:
+        import dask
+        return dask.delayed(stores)
+
     try:
-        writer.sync()
+        delayed = writer.sync(compute=compute)
         for store in stores:
             store.sync()
     finally:
@@ -782,7 +802,7 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
 
 
 def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
-            encoding=None):
+            encoding=None, compute=True):
     """This function creates an appropriate datastore for writing a dataset to
     a zarr store
 
@@ -803,5 +823,9 @@ def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
 
     # I think zarr stores should always be sync'd immediately
     # TODO: figure out how to properly handle unlimited_dims
-    dataset.dump_to_store(store, sync=True, encoding=encoding)
+    dataset.dump_to_store(store, sync=True, encoding=encoding, compute=compute)
+
+    if not compute:
+        import dask
+        return dask.delayed(_finalize_store)(store.delayed_store, store)
     return store
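The easy-to-miss detail in ``_finalize_store`` is that ``del write`` is not dead code: passing the delayed write result into the finalizer makes it a dependency in the dask graph, so syncing and closing can only run after the store step completes. A standalone sketch of that ordering trick, with toy functions standing in for the xarray internals:

import dask

@dask.delayed
def write_data():
    # stands in for the Delayed returned by da.store(..., compute=False)
    print('writing')

@dask.delayed
def finalize(write, name):
    del write  # value unused; only the graph dependency matters
    print('closing', name)

finalize(write_data(), 'store').compute()
# always prints 'writing' before 'closing store'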

xarray/backends/common.py

Lines changed: 8 additions & 4 deletions
@@ -264,19 +264,23 @@ def add(self, source, target):
         else:
             target[...] = source
 
-    def sync(self):
+    def sync(self, compute=True):
         if self.sources:
             import dask.array as da
-            da.store(self.sources, self.targets, lock=self.lock)
+            delayed_store = da.store(self.sources, self.targets,
+                                     lock=self.lock, compute=compute,
+                                     flush=True)
             self.sources = []
             self.targets = []
+            return delayed_store
 
 
 class AbstractWritableDataStore(AbstractDataStore):
     def __init__(self, writer=None, lock=HDF5_LOCK):
         if writer is None:
             writer = ArrayWriter(lock=lock)
         self.writer = writer
+        self.delayed_store = None
 
     def encode(self, variables, attributes):
         """
@@ -318,11 +322,11 @@ def set_attribute(self, k, v):  # pragma: no cover
     def set_variable(self, k, v):  # pragma: no cover
         raise NotImplementedError
 
-    def sync(self):
+    def sync(self, compute=True):
         if self._isopen and self._autoclose:
             # datastore will be reopened during write
             self.close()
-        self.writer.sync()
+        self.delayed_store = self.writer.sync(compute=compute)
 
     def store_dataset(self, dataset):
         """

xarray/backends/h5netcdf_.py

Lines changed: 5 additions & 2 deletions
@@ -212,9 +212,12 @@ def prepare_variable(self, name, variable, check_encoding=False,
 
         return target, variable.data
 
-    def sync(self):
+    def sync(self, compute=True):
+        if not compute:
+            raise NotImplementedError(
+                'compute=False is not supported for the h5netcdf backend yet')
         with self.ensure_open(autoclose=True):
-            super(H5NetCDFStore, self).sync()
+            super(H5NetCDFStore, self).sync(compute=compute)
             self.ds.sync()
 
     def close(self):

xarray/backends/netCDF4_.py

Lines changed: 2 additions & 2 deletions
@@ -439,9 +439,9 @@ def prepare_variable(self, name, variable, check_encoding=False,
 
         return target, variable.data
 
-    def sync(self):
+    def sync(self, compute=True):
         with self.ensure_open(autoclose=True):
-            super(NetCDF4DataStore, self).sync()
+            super(NetCDF4DataStore, self).sync(compute=compute)
             self.ds.sync()
 
     def close(self):

xarray/backends/scipy_.py

Lines changed: 5 additions & 2 deletions
@@ -219,9 +219,12 @@ def prepare_variable(self, name, variable, check_encoding=False,
 
         return target, data
 
-    def sync(self):
+    def sync(self, compute=True):
+        if not compute:
+            raise NotImplementedError(
+                'compute=False is not supported for the scipy backend yet')
         with self.ensure_open(autoclose=True):
-            super(ScipyDataStore, self).sync()
+            super(ScipyDataStore, self).sync(compute=compute)
             self.ds.flush()
 
     def close(self):
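Because the scipy store (like the h5netcdf one above) raises ``NotImplementedError`` for ``compute=False``, callers that cannot choose their backend may want to guard the call. A hedged sketch, with the dataset and path invented for illustration:

try:
    delayed = ds.to_netcdf('out.nc', engine='scipy', compute=False)
except NotImplementedError:
    # this backend only supports eager writes for now
    ds.to_netcdf('out.nc', engine='scipy')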

xarray/backends/zarr.py

Lines changed: 2 additions & 2 deletions
@@ -342,8 +342,8 @@ def store(self, variables, attributes, *args, **kwargs):
         AbstractWritableDataStore.store(self, variables, attributes,
                                         *args, **kwargs)
 
-    def sync(self):
-        self.writer.sync()
+    def sync(self, compute=True):
+        self.delayed_store = self.writer.sync(compute=compute)
 
 
 def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
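With ``ZarrStore.sync`` stashing the writer's delayed store, ``to_zarr`` supports the same deferred pattern as ``to_netcdf``. A small usage sketch, assuming a chunked dataset and a local directory store (both invented for illustration):

import xarray as xr

ds = xr.Dataset({'x': ('t', [1.0, 2.0, 3.0])}).chunk({'t': 1})
delayed = ds.to_zarr('example.zarr', compute=False)
delayed.compute()  # array writes plus the final sync/close happen here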

xarray/core/dataset.py

Lines changed: 14 additions & 6 deletions
@@ -1055,7 +1055,7 @@ def reset_coords(self, names=None, drop=False, inplace=False):
         return obj
 
     def dump_to_store(self, store, encoder=None, sync=True, encoding=None,
-                      unlimited_dims=None):
+                      unlimited_dims=None, compute=True):
         """Store dataset contents to a backends.*DataStore object."""
         if encoding is None:
             encoding = {}
@@ -1074,10 +1074,11 @@ def dump_to_store(self, store, encoder=None, sync=True, encoding=None,
         store.store(variables, attrs, check_encoding,
                     unlimited_dims=unlimited_dims)
         if sync:
-            store.sync()
+            store.sync(compute=compute)
 
     def to_netcdf(self, path=None, mode='w', format=None, group=None,
-                  engine=None, encoding=None, unlimited_dims=None):
+                  engine=None, encoding=None, unlimited_dims=None,
+                  compute=True):
         """Write dataset contents to a netCDF file.
 
         Parameters
@@ -1136,16 +1137,20 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
             By default, no dimensions are treated as unlimited dimensions.
             Note that unlimited_dims may also be set via
            ``dataset.encoding['unlimited_dims']``.
+        compute: boolean
+            If True, compute immediately, otherwise return a
+            ``dask.delayed.Delayed`` object that can be computed later.
         """
         if encoding is None:
             encoding = {}
         from ..backends.api import to_netcdf
         return to_netcdf(self, path, mode, format=format, group=group,
                          engine=engine, encoding=encoding,
-                         unlimited_dims=unlimited_dims)
+                         unlimited_dims=unlimited_dims,
+                         compute=compute)
 
     def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
-                encoding=None):
+                encoding=None, compute=True):
         """Write dataset contents to a zarr group.
 
         .. note:: Experimental
@@ -1167,6 +1172,9 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
             Nested dictionary with variable names as keys and dictionaries of
             variable specific encodings as values, e.g.,
             ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,}, ...}``
+        compute: boolean
+            If True, compute immediately, otherwise return a
+            ``dask.delayed.Delayed`` object that can be computed later.
         """
         if encoding is None:
             encoding = {}
@@ -1176,7 +1184,7 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
                             "and 'w-'.")
         from ..backends.api import to_zarr
         return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer,
-                       group=group, encoding=encoding)
+                       group=group, encoding=encoding, compute=compute)
 
     def __unicode__(self):
         return formatting.dataset_repr(self)
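Taken together with the ``save_mfdataset`` changes in api.py, this makes multi-file writes deferrable as well: ``compute=False`` returns one delayed object wrapping every per-file store. A hedged sketch, reusing the year-by-year split from the ``save_mfdataset`` docstring (a dask-backed ``ds`` with a time coordinate is assumed):

import xarray as xr
from dask.diagnostics import ProgressBar

years, datasets = zip(*ds.groupby('time.year'))
paths = ['%s.nc' % y for y in years]
delayed = xr.save_mfdataset(datasets, paths, compute=False)
with ProgressBar():
    delayed.compute()  # all files are written through one dask graph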
