Skip to content

Xarray dumpers / loaders #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 39 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
6ad73fc
Work with new array schema and LoL generator
rly Jul 3, 2024
92ed77b
Put labeled by as string annotation on array
rly Jul 3, 2024
bbfb77a
Use pydantic 2 in array classes
rly Jul 4, 2024
f1754fb
Temp workaround for multivalued annotation value
rly Jul 5, 2024
3bf80a1
Use pydantic 2 in classes
rly Jul 5, 2024
fc77a6d
Update tests, dumpers, loaders
rly Jul 5, 2024
3432b90
Update tests, style
rly Jul 5, 2024
4009efe
Update gitignore
rly Jul 6, 2024
35e8750
Clean up
rly Jul 6, 2024
0af7a87
Clean up
rly Jul 6, 2024
1f6a7f5
Clean up
rly Jul 6, 2024
188e5aa
Clean up
rly Jul 6, 2024
3a4d785
Clean up
rly Jul 6, 2024
077ca53
Update array file format to be dict of sources
rly Jul 7, 2024
175c882
Update deps, remove cookiecutter cli
rly Jul 7, 2024
ca40bff
Remove demo test
rly Jul 7, 2024
6aa8878
adjust for non ordered keys
melonora Jul 27, 2024
1bbd018
add xarray loaders
melonora Aug 1, 2024
0d68f95
update dependency
melonora Aug 1, 2024
0858c17
update module imports
melonora Aug 1, 2024
d436994
add tests
melonora Aug 1, 2024
a23202d
add xarray zarr dumper test
melonora Aug 1, 2024
64f411b
add xarray yaml netcdf dumper
melonora Aug 1, 2024
0fcadbb
add xarray input
melonora Aug 1, 2024
e3d93bb
delete outdated .nc
melonora Aug 5, 2024
3f86415
delete outdated .zarr
melonora Aug 5, 2024
d328605
added name to data arrays
melonora Aug 5, 2024
40e2ded
remove outdated data
melonora Aug 6, 2024
175e014
move reference date to coords.attrs
melonora Aug 6, 2024
f5ce0f9
move conversion factor to array attributes
melonora Aug 6, 2024
0d213c3
add updated data
melonora Aug 6, 2024
4e9f44f
add yaml xarray zarr dumper
melonora Aug 6, 2024
06f527e
don't use datetime
melonora Aug 6, 2024
b26372e
added xarray zarr loader
melonora Aug 6, 2024
ffb9645
added xarray yaml loaders
melonora Aug 6, 2024
ad32157
Merge branch 'main' into xarray
rly Sep 19, 2024
c0bdd6c
Update dumpers to write relative to yaml, make loader/dumper tests in…
rly Sep 19, 2024
741e212
Implement dump and raise error for dumps
rly Sep 19, 2024
77a0feb
adjust for deprecated data keyword argument
melonora Jan 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ numpy = ">=1.24.3"
h5py = ">=3.9.0"
zarr = ">=2.16.1"
xarray = "^2024.1.1"
h5netcdf = ">=1.3.0"
ruamel-yaml = "^0.18.6"
importlib_metadata = "*"

Expand Down
6 changes: 6 additions & 0 deletions src/linkml_arrays/dumpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
"""Dumper classes for linkml-arrays."""

from .hdf5_dumper import Hdf5Dumper
from .xarray_dumpers import XarrayNetCDFDumper, XarrayZarrDumper
from .yaml_dumper import YamlDumper
from .yaml_hdf5_dumper import YamlHdf5Dumper
from .yaml_numpy_dumper import YamlNumpyDumper
from .yaml_xarray_dumpers import YamlXarrayNetCDFDumper, YamlXarrayZarrDumper
from .zarr_directory_store_dumper import ZarrDirectoryStoreDumper

__all__ = [
"Hdf5Dumper",
"XarrayNetCDFDumper",
"XarrayZarrDumper",
"YamlDumper",
"YamlHdf5Dumper",
"YamlNumpyDumper",
"ZarrDirectoryStoreDumper",
"YamlXarrayNetCDFDumper",
"YamlXarrayZarrDumper",
]
10 changes: 6 additions & 4 deletions src/linkml_arrays/dumpers/hdf5_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,16 @@ def _iterate_element(
class Hdf5Dumper(Dumper):
    """Dumper class for LinkML models to HDF5 files."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to an HDF5 file.

        :param element: LinkML model instance (YAMLRoot or Pydantic BaseModel) to serialize.
        :param to_file: Path of the HDF5 file to create; any existing file is overwritten.
        :param schemaview: SchemaView of the schema describing the element.
        """
        # "w" truncates an existing file; the context manager guarantees the
        # handle is closed even if serialization fails partway through.
        with h5py.File(to_file, "w") as f:
            _iterate_element(element, schemaview, f)

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: HDF5 output is binary and file-based, not a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")
132 changes: 132 additions & 0 deletions src/linkml_arrays/dumpers/xarray_dumpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""Class for dumping a LinkML model to netcdf like using xarray and DataTree."""

from pathlib import Path

import numpy as np
from xarray.core.datatree import DataTree

"""Class for dumping a LinkML model to an netcdf like file."""

from pathlib import Path
from typing import Union

import xarray as xr
import pandas as pd
from linkml_runtime import SchemaView
from linkml_runtime.dumpers.dumper_root import Dumper
from linkml_runtime.utils.yamlutils import YAMLRoot
from pydantic import BaseModel
from linkml_runtime import SchemaView


def _create_node(model, schemaview):
    """Build an xarray ``Dataset`` from one Pydantic submodel.

    Maps the model's slots onto the dict layout accepted by
    ``xr.Dataset.from_dict`` (keys: attrs / coords / data_vars / dims):
    plain string slots become ``attrs``, single-slot submodels and 1-D
    value lists become ``coords``, and nested (list-of-list) value arrays
    become ``data_vars`` with dims taken from the schema's array
    dimension expressions.

    :param model: Pydantic BaseModel whose slots are mapped onto the Dataset.
    :param schemaview: SchemaView used to resolve array dimension names.
    :return: ``xr.Dataset`` built from the collected dict.
    """
    node_dict = {}
    for k, v in vars(model).items():
        if isinstance(v, str):
            # parts of the dataset with key and value both being string,
            # e.g. name, latitude_in_deg
            node_dict.setdefault("attrs", {})[k] = v
        elif isinstance(v, BaseModel):
            if len(var_dict := vars(v)) == 1:
                # A single-slot submodel is a coordinate such as date.
                node_dict.setdefault("coords", {})[k] = {"data": v.values, "dims": k}
            else:
                for key, value in var_dict.items():
                    if key == "values":
                        if not isinstance(value[0], list):
                            # 1-D values: a coordinate. Reuse the first registered
                            # coord's dim; fall back to the slot name when no coord
                            # exists yet (the original raised KeyError here).
                            coords = node_dict.setdefault("coords", {})
                            dim = next(iter(coords), k)
                            coords[k] = {"data": value, "dims": dim}
                        else:
                            # n-D array slot (e.g. the temperature matrix); dims come
                            # from the schema's array dimension expressions.
                            element_type = type(v).__name__
                            dimension_exprs = schemaview.induced_slot(
                                key, element_type
                            ).array.dimensions
                            dims = [dim.alias for dim in dimension_exprs]
                            array = np.array(value)
                            node_dict["dims"] = {
                                dim: array.shape[i] for i, dim in enumerate(dims)
                            }
                            # Merge into any existing entry instead of replacing it.
                            node_dict.setdefault("data_vars", {}).setdefault(k, {}).update(
                                {"data": array, "dims": list(node_dict["dims"])}
                            )
                    elif isinstance(value, str):
                        # can't use timestamp here as it does not serialize,
                        # potentially add with 'data' dims as coord
                        node_dict.setdefault("coords", {}).setdefault(k, {}).update(
                            {"attrs": {key: value}}
                        )
                    else:
                        # Numeric attribute (e.g. conversion factor). Merge one
                        # key at a time so multiple attrs and multiple data_vars
                        # survive (the original clobbered the whole data_vars dict).
                        node_dict.setdefault("data_vars", {}).setdefault(k, {}).setdefault(
                            "attrs", {}
                        )[key] = value
    return xr.Dataset.from_dict(node_dict)



def _iterate_element(
    element: Union[YAMLRoot, BaseModel], schemaview: SchemaView, datatree=None
):
    """Recursively iterate through the elements of a LinkML model and save them.

    Write toplevel Pydantic BaseModel objects as datasets, slots with the "array"
    element as datasets, and other slots as attributes.

    :param element: LinkML model instance to serialize.
    :param schemaview: SchemaView used to resolve array dimension names.
    :param datatree: ``DataTree`` node to populate; returned after filling.
    :return: the populated ``datatree``.
    """
    for k, v in vars(element).items():
        if isinstance(v, BaseModel):
            if hasattr(v, "values"):
                # Slot holds an array: wrap it in a DataArray.
                # TODO: derive dimension names from the schemaview instead of
                # hard-coding them here.
                dims = ["y", "x"]
                data_dict = vars(v)
                data_dict["data"] = np.array(data_dict.pop("values"))
                # Use ndim (number of axes), not shape[0] (length of the first
                # axis), to decide how many dimension names apply.
                data_dict["dims"] = [dims[i] for i in range(data_dict["data"].ndim)]
                data_dict["attrs"] = {"name": v.name}
                dataarray = xr.DataArray.from_dict(d=data_dict)
                datatree[k] = dataarray
            else:
                # Nested model: build a Dataset and attach it as a child node.
                dataset = _create_node(v, schemaview)
                datatree[k] = DataTree(dataset)
        elif isinstance(v, str):
            datatree.attrs["name"] = v
    return datatree


class XarrayNetCDFDumper(Dumper):
    """Dumper class for LinkML models to netCDF files via an xarray DataTree."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a netCDF file using the h5netcdf engine.

        :param element: LinkML model instance to serialize.
        :param to_file: Path of the .nc file to write.
        :param schemaview: SchemaView of the schema describing the element.
        """
        datatree = DataTree()
        datatree = _iterate_element(element, schemaview, datatree)
        datatree.to_netcdf(to_file, engine='h5netcdf')

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: netCDF output is binary and file-based, not a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")


class XarrayZarrDumper(Dumper):
    """Dumper class for LinkML models to Zarr stores via an xarray DataTree."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a Zarr store.

        :param element: LinkML model instance to serialize.
        :param to_file: Path of the .zarr store to write.
        :param schemaview: SchemaView of the schema describing the element.
        """
        datatree = DataTree()
        datatree = _iterate_element(element, schemaview, datatree)
        datatree.to_zarr(to_file)

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: Zarr output is a directory store, not a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")
27 changes: 20 additions & 7 deletions src/linkml_arrays/dumpers/yaml_array_file_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,17 @@ def _iterate_element(
else:
output_file_name = f"{found_slot.name}"

# if output_dir is absolute, make it relative to current working directory
# and create the directory if it does not exist
if output_dir.is_absolute():
output_dir = Path(os.path.relpath(output_dir, start=os.getcwd()))
output_dir.mkdir(exist_ok=True)
output_file_path_no_suffix = output_dir / output_file_name
output_file_path_no_suffix = (output_dir / output_file_name)

# save the numpy array to file and write the file path to the dictionary
output_file_path = write_array(v, output_file_path_no_suffix)

# write the path to the array file relative to the output directory where the yaml is written
relative_output_file_path = os.path.relpath(output_file_path, start=output_dir)
ret_dict[k] = {
"source": [
{
"file": f"./{output_file_path}",
"file": f"{relative_output_file_path}",
"format": format,
}
]
Expand All @@ -100,6 +98,21 @@ class YamlArrayFileDumper(Dumper, metaclass=ABCMeta):

# FORMAT is a class attribute that must be set by subclasses

def dump(
    self,
    element: Union[YAMLRoot, BaseModel],
    to_file: str,
    schemaview: SchemaView,
    **kwargs,
):
    """Dump the element to a YAML file with paths to array files.

    Arrays are written as separate files next to the YAML file; the YAML
    stores paths to them relative to its own location.

    :param element: LinkML model instance to serialize.
    :param to_file: Path of the YAML file to write.
    :param schemaview: SchemaView of the schema describing the element.
    """
    # ``Path(to_file).parent`` is already a Path; no need to re-wrap it.
    output_dir = Path(to_file).parent
    # Renamed from ``input`` to avoid shadowing the builtin.
    element_dict = _iterate_element(
        element, schemaview, output_dir, self.write_array, self.FORMAT
    )
    with open(to_file, "w") as f:
        yaml.dump(element_dict, f)

def dumps(
self,
element: Union[YAMLRoot, BaseModel],
Expand Down
63 changes: 63 additions & 0 deletions src/linkml_arrays/dumpers/yaml_xarray_dumpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Class for dumping a LinkML model to YAML with paths to NumPy files."""

from pathlib import Path
from typing import List, Union

import numpy as np
import xarray as xr

from .yaml_array_file_dumper import YamlArrayFileDumper


class YamlXarrayNetCDFDumper(YamlArrayFileDumper):
    """Dumper class for LinkML models to YAML with paths to .nc files.

    Each array is wrapped in an xarray ``DataArray`` and written to a new
    .nc (netCDF) file.
    """

    FILE_SUFFIX = ".nc"  # used in parent class
    FORMAT = "netcdf"

    @classmethod
    def write_array(
        cls, array: Union[List, np.ndarray], output_file_path_no_suffix: Union[str, Path]
    ):
        """Write an array to a netCDF file and return the path to that file.

        :param array: Array data (list or ndarray) to write.
        :param output_file_path_no_suffix: Target path without the file suffix.
        :return: Path of the .nc file written.
        """
        # TODO do not assume that there is only one array by this name
        # add suffix to the file name
        if isinstance(output_file_path_no_suffix, str):
            output_file_path_no_suffix = Path(output_file_path_no_suffix)
        output_file_path = output_file_path_no_suffix.parent / (
            output_file_path_no_suffix.name + cls.FILE_SUFFIX
        )

        data_array = xr.DataArray(data=array)
        data_array.to_netcdf(output_file_path, engine="h5netcdf")
        return output_file_path


class YamlXarrayZarrDumper(YamlArrayFileDumper):
    """Dumper class for LinkML models to YAML with paths to .zarr files.

    Each array is wrapped in an xarray ``DataArray`` and written to a new
    .zarr store.
    """

    FILE_SUFFIX = ".zarr"  # used in parent class
    FORMAT = "zarr"

    @classmethod
    def write_array(
        cls, array: Union[List, np.ndarray], output_file_path_no_suffix: Union[str, Path]
    ):
        """Write an array to a Zarr store and return the path to that store.

        :param array: Array data (list or ndarray) to write.
        :param output_file_path_no_suffix: Target path without the file suffix.
        :return: Path of the .zarr store written.
        """
        # TODO do not assume that there is only one array by this name
        # add suffix to the file name
        if isinstance(output_file_path_no_suffix, str):
            output_file_path_no_suffix = Path(output_file_path_no_suffix)
        output_file_path = output_file_path_no_suffix.parent / (
            output_file_path_no_suffix.name + cls.FILE_SUFFIX
        )

        data_array = xr.DataArray(data=array)
        data_array.to_zarr(output_file_path)
        return output_file_path
10 changes: 6 additions & 4 deletions src/linkml_arrays/dumpers/zarr_directory_store_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,17 @@ def _iterate_element(
class ZarrDirectoryStoreDumper(Dumper):
    """Dumper class for LinkML models to Zarr directory stores."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a Zarr directory store.

        :param element: LinkML model instance to serialize.
        :param to_file: Path of the Zarr directory store to create.
        :param schemaview: SchemaView of the schema describing the element.
        """
        store = zarr.DirectoryStore(to_file)
        # overwrite=True replaces any existing group at this location.
        root = zarr.group(store=store, overwrite=True)
        _iterate_element(element, schemaview, root)

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: a directory store cannot be rendered as a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")
3 changes: 3 additions & 0 deletions src/linkml_arrays/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
from .hdf5_loader import Hdf5Loader
from .yaml_array_file_loader import YamlArrayFileLoader
from .yaml_loader import YamlLoader
from .xarray_loaders import XarrayZarrLoader, XarrayNetCDFLoader
from .zarr_directory_store_loader import ZarrDirectoryStoreLoader

__all__ = [
"Hdf5Loader",
"XarrayNetCDFLoader",
"XarrayZarrLoader",
"YamlArrayFileLoader",
"YamlLoader",
"ZarrDirectoryStoreLoader",
Expand Down
Loading