Skip to content

Xarray dumpers / loaders #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 39 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
6ad73fc
Work with new array schema and LoL generator
rly Jul 3, 2024
92ed77b
Put labeled by as string annotation on array
rly Jul 3, 2024
bbfb77a
Use pydantic 2 in array classes
rly Jul 4, 2024
f1754fb
Temp workaround for multivalued annotation value
rly Jul 5, 2024
3bf80a1
Use pydantic 2 in classes
rly Jul 5, 2024
fc77a6d
Update tests, dumpers, loaders
rly Jul 5, 2024
3432b90
Update tests, style
rly Jul 5, 2024
4009efe
Update gitignore
rly Jul 6, 2024
35e8750
Clean up
rly Jul 6, 2024
0af7a87
Clean up
rly Jul 6, 2024
1f6a7f5
Clean up
rly Jul 6, 2024
188e5aa
Clean up
rly Jul 6, 2024
3a4d785
Clean up
rly Jul 6, 2024
077ca53
Update array file format to be dict of sources
rly Jul 7, 2024
175c882
Update deps, remove cookiecutter cli
rly Jul 7, 2024
ca40bff
Remove demo test
rly Jul 7, 2024
6aa8878
adjust for non ordered keys
melonora Jul 27, 2024
1bbd018
add xarray loaders
melonora Aug 1, 2024
0d68f95
update dependency
melonora Aug 1, 2024
0858c17
update module imports
melonora Aug 1, 2024
d436994
add tests
melonora Aug 1, 2024
a23202d
add xarray zarr dumper test
melonora Aug 1, 2024
64f411b
add xarray yaml netcdf dumper
melonora Aug 1, 2024
0fcadbb
add xarray input
melonora Aug 1, 2024
e3d93bb
delete outdated .nc
melonora Aug 5, 2024
3f86415
delete outdated .zarr
melonora Aug 5, 2024
d328605
added name to data arrays
melonora Aug 5, 2024
40e2ded
remove outdated data
melonora Aug 6, 2024
175e014
move reference date to coords.attrs
melonora Aug 6, 2024
f5ce0f9
move conversion factor to array attributes
melonora Aug 6, 2024
0d213c3
add updated data
melonora Aug 6, 2024
4e9f44f
add yaml xarray zarr dumper
melonora Aug 6, 2024
06f527e
don't use datetime
melonora Aug 6, 2024
b26372e
added xarray zarr loader
melonora Aug 6, 2024
ffb9645
added xarray yaml loaders
melonora Aug 6, 2024
ad32157
Merge branch 'main' into xarray
rly Sep 19, 2024
c0bdd6c
Update dumpers to write relative to yaml, make loader/dumper tests in…
rly Sep 19, 2024
741e212
Implement dump and raise error for dumps
rly Sep 19, 2024
77a0feb
adjust for deprecated data keyword argument
melonora Jan 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ numpy = ">=1.24.3"
h5py = ">=3.9.0"
zarr = ">=2.16.1"
xarray = "^2024.1.1"
h5netcdf = ">=1.3.0"
ruamel-yaml = "^0.18.6"
importlib_metadata = "*"

Expand Down
6 changes: 6 additions & 0 deletions src/linkml_arrays/dumpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
"""Dumper classes for linkml-arrays."""

from .hdf5_dumper import Hdf5Dumper
from .xarray_dumpers import XarrayNetCDFDumper, XarrayZarrDumper
from .yaml_dumper import YamlDumper
from .yaml_hdf5_dumper import YamlHdf5Dumper
from .yaml_numpy_dumper import YamlNumpyDumper
from .yaml_xarray_dumpers import YamlXarrayNetCDFDumper, YamlXarrayZarrDumper
from .zarr_directory_store_dumper import ZarrDirectoryStoreDumper

__all__ = [
"Hdf5Dumper",
"XarrayNetCDFDumper",
"XarrayZarrDumper",
"YamlDumper",
"YamlHdf5Dumper",
"YamlNumpyDumper",
"ZarrDirectoryStoreDumper",
"YamlXarrayNetCDFDumper",
"YamlXarrayZarrDumper",
]
10 changes: 6 additions & 4 deletions src/linkml_arrays/dumpers/hdf5_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,16 @@ def _iterate_element(
class Hdf5Dumper(Dumper):
    """Dumper class for LinkML models to HDF5 files."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to an HDF5 file.

        :param element: LinkML model instance (YAMLRoot or Pydantic BaseModel) to serialize.
        :param to_file: Path of the HDF5 file to create; any existing file is overwritten.
        :param schemaview: SchemaView of the schema describing the element.
        """
        # "w" truncates an existing file; the context manager guarantees the
        # handle is closed even if serialization fails partway through.
        with h5py.File(to_file, "w") as f:
            _iterate_element(element, schemaview, f)

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: HDF5 output is binary and file-based, not a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")
132 changes: 132 additions & 0 deletions src/linkml_arrays/dumpers/xarray_dumpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""Class for dumping a LinkML model to netcdf like using xarray and DataTree."""

from pathlib import Path

import numpy as np
from xarray.core.datatree import DataTree

"""Class for dumping a LinkML model to an netcdf like file."""

from pathlib import Path
from typing import Union

import xarray as xr
import pandas as pd
from linkml_runtime import SchemaView
from linkml_runtime.dumpers.dumper_root import Dumper
from linkml_runtime.utils.yamlutils import YAMLRoot
from pydantic import BaseModel
from linkml_runtime import SchemaView


def _create_node(model, schemaview):
    """Build an xarray ``Dataset`` from one Pydantic submodel.

    Maps the model's slots onto the dict layout accepted by
    ``xr.Dataset.from_dict`` (keys: attrs / coords / data_vars / dims):
    plain string slots become ``attrs``, single-slot submodels and 1-D
    value lists become ``coords``, and nested (list-of-list) value arrays
    become ``data_vars`` with dims taken from the schema's array
    dimension expressions.

    :param model: Pydantic BaseModel whose slots are mapped onto the Dataset.
    :param schemaview: SchemaView used to resolve array dimension names.
    :return: ``xr.Dataset`` built from the collected dict.
    """
    node_dict = {}
    for k, v in vars(model).items():
        if isinstance(v, str):
            # parts of the dataset with key and value both being string,
            # e.g. name, latitude_in_deg
            node_dict.setdefault("attrs", {})[k] = v
        elif isinstance(v, BaseModel):
            if len(var_dict := vars(v)) == 1:
                # A single-slot submodel is a coordinate such as date.
                node_dict.setdefault("coords", {})[k] = {"data": v.values, "dims": k}
            else:
                for key, value in var_dict.items():
                    if key == "values":
                        if not isinstance(value[0], list):
                            # 1-D values: a coordinate. Reuse the first registered
                            # coord's dim; fall back to the slot name when no coord
                            # exists yet (the original raised KeyError here).
                            coords = node_dict.setdefault("coords", {})
                            dim = next(iter(coords), k)
                            coords[k] = {"data": value, "dims": dim}
                        else:
                            # n-D array slot (e.g. the temperature matrix); dims come
                            # from the schema's array dimension expressions.
                            element_type = type(v).__name__
                            dimension_exprs = schemaview.induced_slot(
                                key, element_type
                            ).array.dimensions
                            dims = [dim.alias for dim in dimension_exprs]
                            array = np.array(value)
                            node_dict["dims"] = {
                                dim: array.shape[i] for i, dim in enumerate(dims)
                            }
                            # Merge into any existing entry instead of replacing it.
                            node_dict.setdefault("data_vars", {}).setdefault(k, {}).update(
                                {"data": array, "dims": list(node_dict["dims"])}
                            )
                    elif isinstance(value, str):
                        # can't use timestamp here as it does not serialize,
                        # potentially add with 'data' dims as coord
                        node_dict.setdefault("coords", {}).setdefault(k, {}).update(
                            {"attrs": {key: value}}
                        )
                    else:
                        # Numeric attribute (e.g. conversion factor). Merge one
                        # key at a time so multiple attrs and multiple data_vars
                        # survive (the original clobbered the whole data_vars dict).
                        node_dict.setdefault("data_vars", {}).setdefault(k, {}).setdefault(
                            "attrs", {}
                        )[key] = value
    return xr.Dataset.from_dict(node_dict)



def _iterate_element(
    element: Union[YAMLRoot, BaseModel], schemaview: SchemaView, datatree=None
):
    """Recursively iterate through the elements of a LinkML model and save them.

    Write toplevel Pydantic BaseModel objects as datasets, slots with the "array"
    element as datasets, and other slots as attributes.

    :param element: LinkML model instance to serialize.
    :param schemaview: SchemaView used to resolve array dimension names.
    :param datatree: ``DataTree`` node to populate; returned after filling.
    :return: the populated ``datatree``.
    """
    for k, v in vars(element).items():
        if isinstance(v, BaseModel):
            if hasattr(v, "values"):
                # Slot holds an array: wrap it in a DataArray.
                # TODO: derive dimension names from the schemaview instead of
                # hard-coding them here.
                dims = ["y", "x"]
                data_dict = vars(v)
                data_dict["data"] = np.array(data_dict.pop("values"))
                # Use ndim (number of axes), not shape[0] (length of the first
                # axis), to decide how many dimension names apply.
                data_dict["dims"] = [dims[i] for i in range(data_dict["data"].ndim)]
                data_dict["attrs"] = {"name": v.name}
                dataarray = xr.DataArray.from_dict(d=data_dict)
                datatree[k] = dataarray
            else:
                # Nested model: build a Dataset and attach it as a child node.
                dataset = _create_node(v, schemaview)
                datatree[k] = DataTree(dataset)
        elif isinstance(v, str):
            datatree.attrs["name"] = v
    return datatree


class XarrayNetCDFDumper(Dumper):
    """Dumper class for LinkML models to netCDF files via an xarray DataTree."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a netCDF file using the h5netcdf engine.

        :param element: LinkML model instance to serialize.
        :param to_file: Path of the .nc file to write.
        :param schemaview: SchemaView of the schema describing the element.
        """
        datatree = DataTree()
        datatree = _iterate_element(element, schemaview, datatree)
        datatree.to_netcdf(to_file, engine='h5netcdf')

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: netCDF output is binary and file-based, not a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")


class XarrayZarrDumper(Dumper):
    """Dumper class for LinkML models to Zarr stores via an xarray DataTree."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a Zarr store.

        :param element: LinkML model instance to serialize.
        :param to_file: Path of the .zarr store to write.
        :param schemaview: SchemaView of the schema describing the element.
        """
        datatree = DataTree()
        datatree = _iterate_element(element, schemaview, datatree)
        datatree.to_zarr(to_file)

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: Zarr output is a directory store, not a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")
27 changes: 20 additions & 7 deletions src/linkml_arrays/dumpers/yaml_array_file_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,17 @@ def _iterate_element(
else:
output_file_name = f"{found_slot.name}"

# if output_dir is absolute, make it relative to current working directory
# and create the directory if it does not exist
if output_dir.is_absolute():
output_dir = Path(os.path.relpath(output_dir, start=os.getcwd()))
output_dir.mkdir(exist_ok=True)
output_file_path_no_suffix = output_dir / output_file_name
output_file_path_no_suffix = (output_dir / output_file_name)

# save the numpy array to file and write the file path to the dictionary
output_file_path = write_array(v, output_file_path_no_suffix)

# write the path to the array file relative to the output directory where the yaml is written
relative_output_file_path = os.path.relpath(output_file_path, start=output_dir)
ret_dict[k] = {
"source": [
{
"file": f"./{output_file_path}",
"file": f"{relative_output_file_path}",
"format": format,
}
]
Expand All @@ -100,6 +98,21 @@ class YamlArrayFileDumper(Dumper, metaclass=ABCMeta):

# FORMAT is a class attribute that must be set by subclasses

def dump(
    self,
    element: Union[YAMLRoot, BaseModel],
    to_file: str,
    schemaview: SchemaView,
    **kwargs,
):
    """Dump the element to a YAML file with paths to array files.

    Arrays are written as separate files next to the YAML file; the YAML
    stores paths to them relative to its own location.

    :param element: LinkML model instance to serialize.
    :param to_file: Path of the YAML file to write.
    :param schemaview: SchemaView of the schema describing the element.
    """
    # ``Path(to_file).parent`` is already a Path; no need to re-wrap it.
    output_dir = Path(to_file).parent
    # Renamed from ``input`` to avoid shadowing the builtin.
    element_dict = _iterate_element(
        element, schemaview, output_dir, self.write_array, self.FORMAT
    )
    with open(to_file, "w") as f:
        yaml.dump(element_dict, f)

def dumps(
self,
element: Union[YAMLRoot, BaseModel],
Expand Down
63 changes: 63 additions & 0 deletions src/linkml_arrays/dumpers/yaml_xarray_dumpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Class for dumping a LinkML model to YAML with paths to NumPy files."""

from pathlib import Path
from typing import List, Union

import numpy as np
import xarray as xr

from .yaml_array_file_dumper import YamlArrayFileDumper


class YamlXarrayNetCDFDumper(YamlArrayFileDumper):
    """Dumper class for LinkML models to YAML with paths to .nc files.

    Each array is wrapped in an xarray ``DataArray`` and written to a new
    .nc (netCDF) file.
    """

    FILE_SUFFIX = ".nc"  # used in parent class
    FORMAT = "netcdf"

    @classmethod
    def write_array(
        cls, array: Union[List, np.ndarray], output_file_path_no_suffix: Union[str, Path]
    ):
        """Write an array to a netCDF file and return the path to that file.

        :param array: Array data (list or ndarray) to write.
        :param output_file_path_no_suffix: Target path without the file suffix.
        :return: Path of the .nc file written.
        """
        # TODO do not assume that there is only one array by this name
        # add suffix to the file name
        if isinstance(output_file_path_no_suffix, str):
            output_file_path_no_suffix = Path(output_file_path_no_suffix)
        output_file_path = output_file_path_no_suffix.parent / (
            output_file_path_no_suffix.name + cls.FILE_SUFFIX
        )

        data_array = xr.DataArray(data=array)
        data_array.to_netcdf(output_file_path, engine="h5netcdf")
        return output_file_path


class YamlXarrayZarrDumper(YamlArrayFileDumper):
    """Dumper class for LinkML models to YAML with paths to .zarr files.

    Each array is wrapped in an xarray ``DataArray`` and written to a new
    .zarr store.
    """

    FILE_SUFFIX = ".zarr"  # used in parent class
    FORMAT = "zarr"

    @classmethod
    def write_array(
        cls, array: Union[List, np.ndarray], output_file_path_no_suffix: Union[str, Path]
    ):
        """Write an array to a Zarr store and return the path to that store.

        :param array: Array data (list or ndarray) to write.
        :param output_file_path_no_suffix: Target path without the file suffix.
        :return: Path of the .zarr store written.
        """
        # TODO do not assume that there is only one array by this name
        # add suffix to the file name
        if isinstance(output_file_path_no_suffix, str):
            output_file_path_no_suffix = Path(output_file_path_no_suffix)
        output_file_path = output_file_path_no_suffix.parent / (
            output_file_path_no_suffix.name + cls.FILE_SUFFIX
        )

        data_array = xr.DataArray(data=array)
        data_array.to_zarr(output_file_path)
        return output_file_path
10 changes: 6 additions & 4 deletions src/linkml_arrays/dumpers/zarr_directory_store_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,17 @@ def _iterate_element(
class ZarrDirectoryStoreDumper(Dumper):
    """Dumper class for LinkML models to Zarr directory stores."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a Zarr directory store.

        :param element: LinkML model instance to serialize.
        :param to_file: Path of the Zarr directory store to create.
        :param schemaview: SchemaView of the schema describing the element.
        """
        store = zarr.DirectoryStore(to_file)
        # overwrite=True replaces any existing group at this location.
        root = zarr.group(store=store, overwrite=True)
        _iterate_element(element, schemaview, root)

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """Not supported: a directory store cannot be rendered as a string."""
        raise NotImplementedError("This method is not sensible for this dumper.")
3 changes: 3 additions & 0 deletions src/linkml_arrays/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
from .hdf5_loader import Hdf5Loader
from .yaml_array_file_loader import YamlArrayFileLoader
from .yaml_loader import YamlLoader
from .xarray_loaders import XarrayZarrLoader, XarrayNetCDFLoader
from .zarr_directory_store_loader import ZarrDirectoryStoreLoader

__all__ = [
"Hdf5Loader",
"XarrayNetCDFLoader",
"XarrayZarrLoader",
"YamlArrayFileLoader",
"YamlLoader",
"ZarrDirectoryStoreLoader",
Expand Down
Loading