-
Notifications
You must be signed in to change notification settings - Fork 1
Xarray dumpers / loaders #16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6ad73fc
92ed77b
bbfb77a
f1754fb
3bf80a1
fc77a6d
3432b90
4009efe
35e8750
0af7a87
1f6a7f5
188e5aa
3a4d785
077ca53
175c882
ca40bff
6aa8878
1bbd018
0d68f95
0858c17
d436994
a23202d
64f411b
0fcadbb
e3d93bb
3f86415
d328605
40e2ded
175e014
f5ce0f9
0d213c3
4e9f44f
06f527e
b26372e
ffb9645
ad32157
c0bdd6c
741e212
77a0feb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,21 @@ | ||
"""Dumper classes for linkml-arrays.""" | ||
|
||
from .hdf5_dumper import Hdf5Dumper | ||
from .xarray_dumpers import XarrayNetCDFDumper, XarrayZarrDumper | ||
from .yaml_dumper import YamlDumper | ||
from .yaml_hdf5_dumper import YamlHdf5Dumper | ||
from .yaml_numpy_dumper import YamlNumpyDumper | ||
from .yaml_xarray_dumpers import YamlXarrayNetCDFDumper, YamlXarrayZarrDumper | ||
from .zarr_directory_store_dumper import ZarrDirectoryStoreDumper | ||
|
||
# Public API of the dumpers subpackage, kept in alphabetical order so new
# dumpers are easy to slot in and diffs stay minimal.
__all__ = [
    "Hdf5Dumper",
    "XarrayNetCDFDumper",
    "XarrayZarrDumper",
    "YamlDumper",
    "YamlHdf5Dumper",
    "YamlNumpyDumper",
    "YamlXarrayNetCDFDumper",
    "YamlXarrayZarrDumper",
    "ZarrDirectoryStoreDumper",
]
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,132 @@ | ||||||
"""Class for dumping a LinkML model to netcdf like using xarray and DataTree.""" | ||||||
|
||||||
from pathlib import Path | ||||||
|
||||||
import numpy as np | ||||||
from xarray.core.datatree import DataTree | ||||||
|
||||||
"""Class for dumping a LinkML model to an netcdf like file.""" | ||||||
|
||||||
from pathlib import Path | ||||||
from typing import Union | ||||||
|
||||||
import xarray as xr | ||||||
import pandas as pd | ||||||
from linkml_runtime import SchemaView | ||||||
from linkml_runtime.dumpers.dumper_root import Dumper | ||||||
from linkml_runtime.utils.yamlutils import YAMLRoot | ||||||
from pydantic import BaseModel | ||||||
from linkml_runtime import SchemaView | ||||||
|
||||||
|
||||||
def _create_node(model, schemaview):
    """Build an ``xr.Dataset`` from a pydantic model via ``Dataset.from_dict``.

    String-valued slots become dataset ``attrs``; nested pydantic models become
    either coordinates (flat value lists) or data variables (nested lists parsed
    into n-d arrays using the array dimensions declared in the schema).

    :param model: pydantic model instance whose slots are translated.
    :param schemaview: ``SchemaView`` used to look up array dimension names.
    :return: an ``xr.Dataset`` built from the collected dict.
    """
    # node_dict follows the layout expected by xarray's Dataset.from_dict:
    # top-level keys "attrs", "coords", "data_vars", and "dims".
    node_dict = {}
    for k, v in vars(model).items():
        if isinstance(v, str):
            # Scalar string slots (e.g. name, latitude_in_deg) become attrs.
            node_dict.setdefault("attrs", {})[k] = v
        elif isinstance(v, BaseModel):
            if len(var_dict := vars(v)) == 1:
                # Single-field models are coordinates (e.g. date). Use the
                # same {"data": ..., "dims": ...} form as the branches below
                # so the dict stays consistent for Dataset.from_dict.
                node_dict.setdefault("coords", {})[k] = {"data": v.values, "dims": k}
            else:
                for key, value in var_dict.items():
                    if key == "values":
                        if not isinstance(value[0], list):
                            # Flat value list: a coordinate indexed by the first
                            # known coordinate's dimension, falling back to this
                            # slot's own name when no coordinate exists yet.
                            existing = list(node_dict.get("coords", {}))
                            dim = existing[0] if existing else k
                            node_dict.setdefault("coords", {})[k] = {"data": value, "dims": dim}
                        else:
                            # Nested value lists: a data variable whose dimension
                            # names come from the schema's array definition
                            # (e.g. the temperature matrix).
                            element_type = type(v).__name__
                            dim_exprs = schemaview.induced_slot(key, element_type).array.dimensions
                            dims = [dim.alias for dim in dim_exprs]
                            array = np.array(value)
                            node_dict["dims"] = {dim: array.shape[i] for i, dim in enumerate(dims)}
                            # setdefault-merge so attrs recorded earlier for the
                            # same variable are preserved.
                            node_dict.setdefault("data_vars", {}).setdefault(k, {}).update(
                                {"data": array, "dims": list(node_dict["dims"])}
                            )
                    elif isinstance(value, str):
                        # Can't use a timestamp here as it does not serialize;
                        # keep string metadata as attrs on the coordinate.
                        node_dict.setdefault("coords", {}).setdefault(k, {}).setdefault(
                            "attrs", {}
                        )[key] = value
                    else:
                        # Numeric metadata (e.g. conversion factor): merge into
                        # the data variable's attrs instead of overwriting the
                        # whole data_vars entry.
                        node_dict.setdefault("data_vars", {}).setdefault(k, {}).setdefault(
                            "attrs", {}
                        )[key] = value
    return xr.Dataset.from_dict(node_dict)
|
||||||
|
||||||
|
||||||
def _iterate_element(
    element: Union[YAMLRoot, BaseModel], schemaview: SchemaView, datatree=None
):
    """Recursively iterate through the elements of a LinkML model and save them.

    Write nested pydantic BaseModel objects as datasets, slots holding raw
    array "values" as DataArray leaves, and string slots as tree attributes.

    :param element: the model instance to serialize.
    :param schemaview: ``SchemaView`` forwarded to ``_create_node``.
    :param datatree: the ``DataTree`` being populated (mutated in place and
        also returned).
    """
    for k, v in vars(element).items():
        if isinstance(v, BaseModel):
            if hasattr(v, "values"):
                # Slot holding raw array values: convert to a DataArray leaf.
                # TODO: derive dimension names from the schemaview instead of
                # hard-coding them (reviewer-flagged).
                dim_names = ["y", "x"]
                # Copy so popping "values" does not mutate the model's __dict__.
                data_dict = dict(vars(v))
                data_dict["data"] = np.array(data_dict.pop("values"))
                # One dimension label per array axis: ndim, not shape[0]
                # (shape[0] is the length of the first axis, not the rank).
                data_dict["dims"] = [dim_names[i] for i in range(data_dict["data"].ndim)]
                data_dict["attrs"] = {"name": v.name}
                datatree[k] = xr.DataArray.from_dict(d=data_dict)
            else:
                # Nested model: build a Dataset and attach it as a subtree.
                datatree[k] = DataTree(_create_node(v, schemaview))
        elif isinstance(v, str):
            # Store the string slot under its own key so slots other than
            # "name" are not silently collapsed into a single attribute.
            datatree.attrs[k] = v
    return datatree
|
||||||
|
||||||
class XarrayNetCDFDumper(Dumper):
    """Dumper class for LinkML models to netCDF files via an xarray DataTree."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a netCDF file using the h5netcdf engine.

        :param element: root model instance to serialize.
        :param to_file: path of the .nc file to write.
        :param schemaview: ``SchemaView`` used to resolve array dimensions.
        """
        datatree = DataTree()
        datatree = _iterate_element(element, schemaview, datatree)
        datatree.to_netcdf(to_file, engine='h5netcdf')

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """String serialization is not supported for binary netCDF output."""
        raise NotImplementedError("This method is not sensible for this dumper.")
|
||||||
|
||||||
class XarrayZarrDumper(Dumper):
    """Dumper class for LinkML models to Zarr stores via an xarray DataTree."""

    def dump(
        self,
        element: Union[YAMLRoot, BaseModel],
        to_file: str,
        schemaview: SchemaView,
        **kwargs,
    ):
        """Dump the element to a Zarr store.

        :param element: root model instance to serialize.
        :param to_file: path of the .zarr store to write.
        :param schemaview: ``SchemaView`` used to resolve array dimensions.
        """
        datatree = DataTree()
        datatree = _iterate_element(element, schemaview, datatree)
        datatree.to_zarr(to_file)

    def dumps(self, element: Union[YAMLRoot, BaseModel], **kwargs):
        """String serialization is not supported for Zarr store output."""
        raise NotImplementedError("This method is not sensible for this dumper.")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
"""Class for dumping a LinkML model to YAML with paths to NumPy files.""" | ||
|
||
from pathlib import Path | ||
from typing import List, Union | ||
|
||
import numpy as np | ||
import xarray as xr | ||
|
||
from .yaml_array_file_dumper import YamlArrayFileDumper | ||
|
||
|
||
class YamlXarrayNetCDFDumper(YamlArrayFileDumper):
    """Dumper class for LinkML models to YAML with paths to .nc files.

    Each array is wrapped in an unnamed xarray DataArray and written to a
    new .nc file.
    """

    FILE_SUFFIX = ".nc"  # used in parent class
    FORMAT = "netcdf"

    @classmethod
    def write_array(
        cls, array: Union[List, np.ndarray], output_file_path_no_suffix: Union[str, Path]
    ):
        """Write an array to a netCDF file and return the resulting path.

        :param array: the array data to write.
        :param output_file_path_no_suffix: target path without the ".nc" suffix.
        :return: the full output ``Path`` including the suffix.
        """
        # TODO do not assume that there is only one array by this name
        # Append the suffix to the file name (not with_suffix, which would
        # replace an existing one).
        base = Path(output_file_path_no_suffix)
        output_file_path = base.parent / (base.name + cls.FILE_SUFFIX)

        data_array = xr.DataArray(data=array)
        data_array.to_netcdf(output_file_path, engine="h5netcdf")
        return output_file_path
|
||
|
||
class YamlXarrayZarrDumper(YamlArrayFileDumper):
    """Dumper class for LinkML models to YAML with paths to .zarr stores.

    Each array is wrapped in an unnamed xarray DataArray and written to a
    new .zarr store.
    """

    FILE_SUFFIX = ".zarr"  # used in parent class
    FORMAT = "zarr"

    @classmethod
    def write_array(
        cls, array: Union[List, np.ndarray], output_file_path_no_suffix: Union[str, Path]
    ):
        """Write an array to a Zarr store and return the resulting path.

        :param array: the array data to write.
        :param output_file_path_no_suffix: target path without the ".zarr" suffix.
        :return: the full output ``Path`` including the suffix.
        """
        # TODO do not assume that there is only one array by this name
        # Append the suffix to the file name (not with_suffix, which would
        # replace an existing one).
        base = Path(output_file_path_no_suffix)
        output_file_path = base.parent / (base.name + cls.FILE_SUFFIX)

        data_array = xr.DataArray(data=array)
        # NOTE(review): DataArray.to_zarr requires a sufficiently recent
        # xarray; older versions expose to_zarr only on Dataset — confirm
        # against the pinned xarray version.
        data_array.to_zarr(output_file_path)
        return output_file_path
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add comment with pointer to the data model of a Dataset's attrs/coords/data_vars/dims.