diff --git a/src/stochtree.cpp b/src/stochtree.cpp index 4cc1eb3..0380828 100644 --- a/src/stochtree.cpp +++ b/src/stochtree.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include #include @@ -114,14 +116,18 @@ class RngCpp { std::unique_ptr rng_; }; -// Forward declaration +// Forward declarations class ForestSamplerCpp; +class JsonCpp; class ForestContainerCpp { public: ForestContainerCpp(int num_trees, int output_dimension = 1, bool is_leaf_constant = true) { // Initialize pointer to C++ ForestContainer class forest_samples_ = std::make_unique(num_trees, output_dimension, is_leaf_constant); + num_trees_ = num_trees; + output_dimension_ = output_dimension; + is_leaf_constant_ = is_leaf_constant; } ~ForestContainerCpp() {} @@ -218,14 +224,16 @@ class ForestContainerCpp { void UpdateResidual(ForestDatasetCpp& dataset, ResidualCpp& residual, ForestSamplerCpp& sampler, bool requires_basis, int forest_num, bool add); - void SaveJson(std::string json_filename) { + void SaveToJsonFile(std::string json_filename) { forest_samples_->SaveToJsonFile(json_filename); } - void LoadFromJson(std::string json_filename) { + void LoadFromJsonFile(std::string json_filename) { forest_samples_->LoadFromJsonFile(json_filename); } + void LoadFromJson(JsonCpp& json, std::string forest_label); + StochTree::ForestContainer* GetContainer() { return forest_samples_.get(); } @@ -234,8 +242,15 @@ class ForestContainerCpp { return forest_samples_->GetEnsemble(i); } + nlohmann::json ToJson() { + return forest_samples_->to_json(); + } + private: std::unique_ptr forest_samples_; + int num_trees_; + int output_dimension_; + bool is_leaf_constant_; }; class ForestSamplerCpp { @@ -383,7 +398,6 @@ class LeafVarianceModelCpp { StochTree::LeafNodeHomoskedasticVarianceModel var_model_; }; -// Implementation of UpdateResidual void ForestContainerCpp::UpdateResidual(ForestDatasetCpp& dataset, ResidualCpp& residual, ForestSamplerCpp& sampler, bool requires_basis, int forest_num, bool 
add) { // Determine whether or not we are adding forest_num to the residuals std::function op; @@ -394,49 +408,355 @@ void ForestContainerCpp::UpdateResidual(ForestDatasetCpp& dataset, ResidualCpp& StochTree::UpdateResidualEntireForest(*(sampler.GetTracker()), *(dataset.GetDataset()), *(residual.GetData()), forest_samples_->GetEnsemble(forest_num), requires_basis, op); } +class JsonCpp { + public: + JsonCpp() { + // Initialize pointer to C++ nlohmann::json class + json_ = std::make_unique(); + nlohmann::json forests = nlohmann::json::object(); + json_->emplace("forests", forests); + json_->emplace("num_forests", 0); + } + ~JsonCpp() {} + + void LoadFile(std::string filename) { + std::ifstream f(filename); + *json_ = nlohmann::json::parse(f); + } + + void SaveFile(std::string filename) { + std::ofstream output_file(filename); + output_file << *json_ << std::endl; + } + + std::string DumpJson() { + return json_->dump(); + } + + std::string AddForest(ForestContainerCpp& forest_samples) { + int forest_num = json_->at("num_forests"); + std::string forest_label = "forest_" + std::to_string(forest_num); + nlohmann::json forest_json = forest_samples.ToJson(); + json_->at("forests").emplace(forest_label, forest_json); + json_->at("num_forests") = forest_num + 1; + return forest_label; + } + + void AddDouble(std::string field_name, double field_value) { + if (json_->contains(field_name)) { + json_->at(field_name) = field_value; + } else { + json_->emplace(std::pair(field_name, field_value)); + } + } + + void AddDoubleSubfolder(std::string subfolder_name, std::string field_name, double field_value) { + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + json_->at(subfolder_name).at(field_name) = field_value; + } else { + json_->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } else { + json_->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_->at(subfolder_name).emplace(std::pair(field_name, 
field_value)); + } + } + + void AddBool(std::string field_name, bool field_value) { + if (json_->contains(field_name)) { + json_->at(field_name) = field_value; + } else { + json_->emplace(std::pair(field_name, field_value)); + } + } + + void AddBoolSubfolder(std::string subfolder_name, std::string field_name, bool field_value) { + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + json_->at(subfolder_name).at(field_name) = field_value; + } else { + json_->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } else { + json_->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } + + void AddString(std::string field_name, std::string field_value) { + if (json_->contains(field_name)) { + json_->at(field_name) = field_value; + } else { + json_->emplace(std::pair(field_name, field_value)); + } + } + + void AddStringSubfolder(std::string subfolder_name, std::string field_name, std::string field_value) { + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + json_->at(subfolder_name).at(field_name) = field_value; + } else { + json_->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } else { + json_->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } + + void AddDoubleVector(std::string field_name, py::array_t field_vector) { + int vec_length = field_vector.size(); + auto accessor = field_vector.unchecked<1>(); // read-only view: the input is never written, and mutable_unchecked rejects non-writeable numpy arrays + if (json_->contains(field_name)) { + json_->at(field_name).clear(); + for (int i = 0; i < vec_length; i++) { + json_->at(field_name).emplace_back(accessor(i)); + } + } else { + json_->emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(field_name).emplace_back(accessor(i)); + } + } + } + + void 
AddDoubleVectorSubfolder(std::string subfolder_name, std::string field_name, py::array_t field_vector) { + int vec_length = field_vector.size(); + auto accessor = field_vector.unchecked<1>(); // read-only view: the input is never written, and mutable_unchecked rejects non-writeable numpy arrays + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + json_->at(subfolder_name).at(field_name).clear(); + for (int i = 0; i < vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(accessor(i)); + } + } else { + json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(accessor(i)); + } + } + } else { + json_->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(accessor(i)); + } + } + } + + void AddStringVector(std::string field_name, std::vector& field_vector) { + int vec_length = field_vector.size(); + if (json_->contains(field_name)) { + json_->at(field_name).clear(); + for (int i = 0; i < vec_length; i++) { + json_->at(field_name).emplace_back(field_vector.at(i)); + } + } else { + json_->emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(field_name).emplace_back(field_vector.at(i)); + } + } + } + + void AddStringVectorSubfolder(std::string subfolder_name, std::string field_name, std::vector& field_vector) { + int vec_length = field_vector.size(); + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + json_->at(subfolder_name).at(field_name).clear(); + for (int i = 0; i < vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i)); + } + } else { + json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < 
vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i)); + } + } + } else { + json_->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i)); + } + } + } + + bool ContainsField(std::string field_name) { + if (json_->contains(field_name)) { + return true; + } else { + return false; + } + } + + bool ContainsFieldSubfolder(std::string subfolder_name, std::string field_name) { + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + return true; + } else { + return false; + } + } else { + return false; + } + } + + double ExtractDouble(std::string field_name) { + return json_->at(field_name); + } + + double ExtractDoubleSubfolder(std::string subfolder_name, std::string field_name) { + return json_->at(subfolder_name).at(field_name); + } + + bool ExtractBool(std::string field_name) { + return json_->at(field_name); + } + + bool ExtractBoolSubfolder(std::string subfolder_name, std::string field_name) { + return json_->at(subfolder_name).at(field_name); + } + + std::string ExtractString(std::string field_name) { + return json_->at(field_name); + } + + std::string ExtractStringSubfolder(std::string subfolder_name, std::string field_name) { + return json_->at(subfolder_name).at(field_name); + } + + py::array_t ExtractDoubleVector(std::string field_name) { + auto json_vec = json_->at(field_name); + ssize_t json_vec_length = json_->at(field_name).size(); + auto result = py::array_t(py::detail::any_container({json_vec_length})); + auto accessor = result.mutable_unchecked<1>(); + for (size_t i = 0; i < json_vec_length; i++) { + accessor(i) = json_vec.at(i); + } + return result; + } + + py::array_t ExtractDoubleVectorSubfolder(std::string subfolder_name, std::string field_name) { + auto json_vec 
= json_->at(subfolder_name).at(field_name); + ssize_t json_vec_length = json_->at(subfolder_name).at(field_name).size(); + auto result = py::array_t(py::detail::any_container({json_vec_length})); + auto accessor = result.mutable_unchecked<1>(); + for (size_t i = 0; i < json_vec_length; i++) { + accessor(i) = json_vec.at(i); + } + return result; + } + + std::vector ExtractStringVector(std::string field_name) { + auto json_vec = json_->at(field_name); + ssize_t json_vec_length = json_->at(field_name).size(); + auto result = std::vector(json_vec_length); + for (size_t i = 0; i < json_vec_length; i++) { + result.at(i) = json_vec.at(i); + } + return result; + } + + std::vector ExtractStringVectorSubfolder(std::string subfolder_name, std::string field_name) { + auto json_vec = json_->at(subfolder_name).at(field_name); + ssize_t json_vec_length = json_->at(subfolder_name).at(field_name).size(); + auto result = std::vector(json_vec_length); + for (size_t i = 0; i < json_vec_length; i++) { + result.at(i) = json_vec.at(i); + } + return result; + } + + nlohmann::json SubsetJsonForest(std::string forest_label) { + return json_->at("forests").at(forest_label); + } + + private: + std::unique_ptr json_; +}; + +void ForestContainerCpp::LoadFromJson(JsonCpp& json, std::string forest_label) { + nlohmann::json forest_json = json.SubsetJsonForest(forest_label); + forest_samples_->Reset(); + forest_samples_->from_json(forest_json); +} + PYBIND11_MODULE(stochtree_cpp, m) { - py::class_(m, "ForestDatasetCpp") - .def(py::init<>()) - .def("AddCovariates", &ForestDatasetCpp::AddCovariates) - .def("AddBasis", &ForestDatasetCpp::AddBasis) - .def("UpdateBasis", &ForestDatasetCpp::UpdateBasis) - .def("AddVarianceWeights", &ForestDatasetCpp::AddVarianceWeights) - .def("NumRows", &ForestDatasetCpp::NumRows); - - py::class_(m, "ResidualCpp") - .def(py::init,data_size_t>()); - - py::class_(m, "RngCpp") - .def(py::init()); - - py::class_(m, "ForestContainerCpp") - .def(py::init()) - 
.def("OutputDimension", &ForestContainerCpp::OutputDimension) - .def("NumSamples", &ForestContainerCpp::NumSamples) - .def("Predict", &ForestContainerCpp::Predict) - .def("PredictRaw", &ForestContainerCpp::PredictRaw) - .def("PredictRawSingleForest", &ForestContainerCpp::PredictRawSingleForest) - .def("SetRootValue", &ForestContainerCpp::SetRootValue) - .def("SetRootVector", &ForestContainerCpp::SetRootVector) - .def("UpdateResidual", &ForestContainerCpp::UpdateResidual) - .def("SaveJson", &ForestContainerCpp::SaveJson) - .def("LoadFromJson", &ForestContainerCpp::LoadFromJson); - - py::class_(m, "ForestSamplerCpp") - .def(py::init, int, data_size_t, double, double, int>()) - .def("SampleOneIteration", &ForestSamplerCpp::SampleOneIteration); - - py::class_(m, "GlobalVarianceModelCpp") - .def(py::init<>()) - .def("SampleOneIteration", &GlobalVarianceModelCpp::SampleOneIteration); - - py::class_(m, "LeafVarianceModelCpp") - .def(py::init<>()) - .def("SampleOneIteration", &LeafVarianceModelCpp::SampleOneIteration); + py::class_(m, "JsonCpp") + .def(py::init<>()) + .def("LoadFile", &JsonCpp::LoadFile) + .def("SaveFile", &JsonCpp::SaveFile) + .def("DumpJson", &JsonCpp::DumpJson) + .def("AddDouble", &JsonCpp::AddDouble) + .def("AddDoubleSubfolder", &JsonCpp::AddDoubleSubfolder) + .def("AddBool", &JsonCpp::AddBool) + .def("AddBoolSubfolder", &JsonCpp::AddBoolSubfolder) + .def("AddString", &JsonCpp::AddString) + .def("AddStringSubfolder", &JsonCpp::AddStringSubfolder) + .def("AddDoubleVector", &JsonCpp::AddDoubleVector) + .def("AddDoubleVectorSubfolder", &JsonCpp::AddDoubleVectorSubfolder) + .def("AddStringVector", &JsonCpp::AddStringVector) + .def("AddStringVectorSubfolder", &JsonCpp::AddStringVectorSubfolder) + .def("AddForest", &JsonCpp::AddForest) + .def("ContainsField", &JsonCpp::ContainsField) + .def("ContainsFieldSubfolder", &JsonCpp::ContainsFieldSubfolder) + .def("ExtractDouble", &JsonCpp::ExtractDouble) + .def("ExtractDoubleSubfolder", 
&JsonCpp::ExtractDoubleSubfolder) + .def("ExtractBool", &JsonCpp::ExtractBool) + .def("ExtractBoolSubfolder", &JsonCpp::ExtractBoolSubfolder) + .def("ExtractString", &JsonCpp::ExtractString) + .def("ExtractStringSubfolder", &JsonCpp::ExtractStringSubfolder) + .def("ExtractDoubleVector", &JsonCpp::ExtractDoubleVector) + .def("ExtractDoubleVectorSubfolder", &JsonCpp::ExtractDoubleVectorSubfolder) + .def("ExtractStringVector", &JsonCpp::ExtractStringVector) + .def("ExtractStringVectorSubfolder", &JsonCpp::ExtractStringVectorSubfolder) + .def("SubsetJsonForest", &JsonCpp::SubsetJsonForest); + + py::class_(m, "ForestDatasetCpp") + .def(py::init<>()) + .def("AddCovariates", &ForestDatasetCpp::AddCovariates) + .def("AddBasis", &ForestDatasetCpp::AddBasis) + .def("UpdateBasis", &ForestDatasetCpp::UpdateBasis) + .def("AddVarianceWeights", &ForestDatasetCpp::AddVarianceWeights) + .def("NumRows", &ForestDatasetCpp::NumRows); + + py::class_(m, "ResidualCpp") + .def(py::init,data_size_t>()); + + py::class_(m, "RngCpp") + .def(py::init()); + + py::class_(m, "ForestContainerCpp") + .def(py::init()) + .def("OutputDimension", &ForestContainerCpp::OutputDimension) + .def("NumSamples", &ForestContainerCpp::NumSamples) + .def("Predict", &ForestContainerCpp::Predict) + .def("PredictRaw", &ForestContainerCpp::PredictRaw) + .def("PredictRawSingleForest", &ForestContainerCpp::PredictRawSingleForest) + .def("SetRootValue", &ForestContainerCpp::SetRootValue) + .def("SetRootVector", &ForestContainerCpp::SetRootVector) + .def("UpdateResidual", &ForestContainerCpp::UpdateResidual) + .def("SaveToJsonFile", &ForestContainerCpp::SaveToJsonFile) + .def("LoadFromJsonFile", &ForestContainerCpp::LoadFromJsonFile) + .def("LoadFromJson", &ForestContainerCpp::LoadFromJson); + + py::class_(m, "ForestSamplerCpp") + .def(py::init, int, data_size_t, double, double, int>()) + .def("SampleOneIteration", &ForestSamplerCpp::SampleOneIteration); + + py::class_(m, "GlobalVarianceModelCpp") + .def(py::init<>()) + 
.def("SampleOneIteration", &GlobalVarianceModelCpp::SampleOneIteration); + + py::class_(m, "LeafVarianceModelCpp") + .def(py::init<>()) + .def("SampleOneIteration", &LeafVarianceModelCpp::SampleOneIteration); #ifdef VERSION_INFO - m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); + m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); #else - m.attr("__version__") = "dev"; + m.attr("__version__") = "dev"; #endif } \ No newline at end of file diff --git a/stochtree/__init__.py b/stochtree/__init__.py index a6b8d7c..b868e54 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -3,6 +3,9 @@ from .data import Dataset, Residual from .forest import ForestContainer from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel +from .serialization import JSONSerializer from .utils import NotSampledError -__all__ = ['BARTModel', 'BCFModel', 'Dataset', 'Residual', 'ForestContainer', 'RNG', 'ForestSampler', 'GlobalVarianceModel', 'LeafVarianceModel', 'NotSampledError'] \ No newline at end of file +__all__ = ['BARTModel', 'BCFModel', 'Dataset', 'Residual', 'ForestContainer', + 'RNG', 'ForestSampler', 'GlobalVarianceModel', 'LeafVarianceModel', + 'JSONSerializer', 'NotSampledError'] \ No newline at end of file diff --git a/stochtree/bart.py b/stochtree/bart.py index 2becab2..a5bdf0d 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -299,8 +299,9 @@ def predict(self, covariates: np.array, basis: np.array = None) -> np.array: # Convert everything to standard shape (2-dimensional) if covariates.ndim == 1: covariates = np.expand_dims(covariates, 1) - if basis.ndim == 1: - basis = np.expand_dims(basis, 1) + if basis is not None: + if basis.ndim == 1: + basis = np.expand_dims(basis, 1) # Data checks if basis is not None: @@ -309,6 +310,7 @@ def predict(self, covariates: np.array, basis: np.array = None) -> np.array: pred_dataset = Dataset() pred_dataset.add_covariates(covariates) - pred_dataset.add_basis(basis) + if basis is not None: + 
pred_dataset.add_basis(basis) pred_raw = self.forest_container.forest_container_cpp.Predict(pred_dataset.dataset_cpp) return pred_raw[:,self.keep_indices]*self.y_std + self.y_bar diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 4a1aa59..9b6a539 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -12,6 +12,9 @@ from .utils import NotSampledError class BCFModel: + """Class that handles sampling, storage, and serialization of causal BART models like BCF, XBCF, and Warm-Start BCF + """ + def __init__(self) -> None: # Internal flag for whether the sample() method has been run self.sampled = False diff --git a/stochtree/forest.py b/stochtree/forest.py index 76d1e78..9b80e58 100644 --- a/stochtree/forest.py +++ b/stochtree/forest.py @@ -3,6 +3,7 @@ """ import numpy as np from .data import Dataset, Residual +# from .serialization import JSONSerializer from stochtree_cpp import ForestContainerCpp from typing import Union @@ -35,8 +36,9 @@ def set_root_leaves(self, forest_num: int, leaf_value: Union[float, np.array]) - else: self.forest_container_cpp.SetRootValue(forest_num, leaf_value) - def save_json(self, json_filename: str) -> None: - self.forest_container_cpp.SaveJson(json_filename) + def save_to_json_file(self, json_filename: str) -> None: + self.forest_container_cpp.SaveToJsonFile(json_filename) - def load_from_json(self, json_filename: str) -> None: - self.forest_container_cpp.LoadFromJson(json_filename) + def load_from_json_file(self, json_filename: str) -> None: + self.forest_container_cpp.LoadFromJsonFile(json_filename) + \ No newline at end of file diff --git a/stochtree/serialization.py b/stochtree/serialization.py new file mode 100644 index 0000000..9c7d04b --- /dev/null +++ b/stochtree/serialization.py @@ -0,0 +1,183 @@ +import warnings +import numpy as np +import pandas as pd +from scipy.linalg import lstsq +from scipy.stats import gamma +from .forest import ForestContainer +from stochtree_cpp import JsonCpp + +class JSONSerializer: + """Class that 
handles serialization and deserialization of stochastic forest models + """ + + def __init__(self) -> None: + self.json_cpp = JsonCpp() + self.num_forests = 0 + self.forest_labels = [] + + def add_forest(self, forest_samples: ForestContainer) -> None: + """Adds a container of forest samples to a json object + + :param forest_samples: Samples of a tree ensemble + :type forest_samples: ForestContainer + """ + forest_label = self.json_cpp.AddForest(forest_samples.forest_container_cpp) + self.num_forests += 1 + self.forest_labels.append(forest_label) + + def add_scalar(self, field_name: str, field_value: float, subfolder_name: str = None) -> None: + """Adds a scalar (numeric) value to a json object + + :param field_name: Name of the json field / label under which the numeric value will be stored + :type field_name: str + :param field_value: Numeric value to be stored + :type field_value: float + :param subfolder_name: Name of "subfolder" under which ``field_name`` to be stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + self.json_cpp.AddDouble(field_name, field_value) + else: + self.json_cpp.AddDoubleSubfolder(subfolder_name, field_name, field_value) + + def add_boolean(self, field_name: str, field_value: bool, subfolder_name: str = None) -> None: + """Adds a scalar (boolean) value to a json object + + :param field_name: Name of the json field / label under which the boolean value will be stored + :type field_name: str + :param field_value: Boolean value to be stored + :type field_value: bool + :param subfolder_name: Name of "subfolder" under which ``field_name`` to be stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + self.json_cpp.AddBool(field_name, field_value) + else: + self.json_cpp.AddBoolSubfolder(subfolder_name, field_name, field_value) + + def add_string(self, field_name: str, field_value: str, subfolder_name: str = None) -> None: + """Adds a string to a 
json object + + :param field_name: Name of the json field / label under which the string will be stored + :type field_name: str + :param field_value: String field to be stored + :type field_value: str + :param subfolder_name: Name of "subfolder" under which ``field_name`` to be stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + self.json_cpp.AddString(field_name, field_value) + else: + self.json_cpp.AddStringSubfolder(subfolder_name, field_name, field_value) + + def add_numeric_vector(self, field_name: str, field_vector: np.array, subfolder_name: str = None) -> None: + """Adds a numeric vector (stored as a numpy array) to a json object + + :param field_name: Name of the json field / label under which the numeric vector will be stored + :type field_name: str + :param field_vector: Numpy array containing the vector to be stored in json. Should be one-dimensional. + :type field_vector: np.array + :param subfolder_name: Name of "subfolder" under which ``field_name`` to be stored in the json hierarchy + :type subfolder_name: str, optional + """ + # Runtime checks + if not isinstance(field_vector, np.ndarray): + raise ValueError("field_vector must be a numpy array") + field_vector = np.atleast_1d(np.squeeze(field_vector))  # squeeze turns a single-element vector into a 0-d array; keep it 1-D for the C++ 1-D accessor + if field_vector.ndim > 1: + warnings.warn("field_vector has more than 1 dimension. 
It will be flattened in row-major order using np.ravel()") + field_vector = np.ravel(field_vector, order = "C") + + if subfolder_name is None: + self.json_cpp.AddDoubleVector(field_name, field_vector) + else: + self.json_cpp.AddDoubleVectorSubfolder(subfolder_name, field_name, field_vector) + + def add_string_vector(self, field_name: str, field_vector: list, subfolder_name: str = None) -> None: + """Adds a list of strings to a json object as an array + + :param field_name: Name of the json field / label under which the string list will be stored + :type field_name: str + :param field_vector: Python list of strings containing the array to be stored in json + :type field_vector: list + :param subfolder_name: Name of "subfolder" under which ``field_name`` to be stored in the json hierarchy + :type subfolder_name: str, optional + """ + # Runtime checks + if not isinstance(field_vector, list): + raise ValueError("field_vector must be a list") + + if subfolder_name is None: + self.json_cpp.AddStringVector(field_name, field_vector) + else: + self.json_cpp.AddStringVectorSubfolder(subfolder_name, field_name, field_vector) + + def get_scalar(self, field_name: str, subfolder_name: str = None) -> float: + """Retrieves a scalar (numeric) value from a json object + + :param field_name: Name of the json field / label under which the numeric value is stored + :type field_name: str + :param subfolder_name: Name of "subfolder" under which ``field_name`` is stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + return self.json_cpp.ExtractDouble(field_name) + else: + return self.json_cpp.ExtractDoubleSubfolder(subfolder_name, field_name) + + def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool: + """Retrieves a scalar (boolean) value from a json object + + :param field_name: Name of the json field / label under which the boolean value is stored + :type field_name: str + :param subfolder_name: Name of "subfolder" 
under which ``field_name`` is stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + return self.json_cpp.ExtractBool(field_name) + else: + return self.json_cpp.ExtractBoolSubfolder(subfolder_name, field_name) + + def get_string(self, field_name: str, subfolder_name: str = None) -> str: + """Retrieves a string from a json object + + :param field_name: Name of the json field / label under which the string is stored + :type field_name: str + :param subfolder_name: Name of "subfolder" under which ``field_name`` is stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + return self.json_cpp.ExtractString(field_name) + else: + return self.json_cpp.ExtractStringSubfolder(subfolder_name, field_name) + + def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np.array: + """Retrieves a numeric vector (as a numpy array) from a json object + + :param field_name: Name of the json field / label under which the numeric vector is stored + :type field_name: str + :param subfolder_name: Name of "subfolder" under which ``field_name`` is stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + return self.json_cpp.ExtractDoubleVector(field_name) + else: + return self.json_cpp.ExtractDoubleVectorSubfolder(subfolder_name, field_name) + + def get_string_vector(self, field_name: str, subfolder_name: str = None) -> list: + """Retrieves a list of strings from a json object + + :param field_name: Name of the json field / label under which the string list is stored + :type field_name: str + :param subfolder_name: Name of "subfolder" under which ``field_name`` is stored in the json hierarchy + :type subfolder_name: str, optional + """ + if subfolder_name is None: + return self.json_cpp.ExtractStringVector(field_name) + else: + return self.json_cpp.ExtractStringVectorSubfolder(subfolder_name, field_name) + + def get_forest_container(self, forest_label: str) -> 
ForestContainer: + result = ForestContainer(0, 1, True) + result.forest_container_cpp.LoadFromJson(self.json_cpp, forest_label) + return result diff --git a/test/test_json.py b/test/test_json.py new file mode 100644 index 0000000..db88d3d --- /dev/null +++ b/test/test_json.py @@ -0,0 +1,71 @@ +import numpy as np +from stochtree import BARTModel, JSONSerializer, ForestContainer, Dataset + +class TestJson: + def test_value(self): + json_test = JSONSerializer() + a = 1.5 + b = True + c = "Example" + json_test.add_scalar("a", a) + json_test.add_boolean("b", b) + json_test.add_string("c", c) + assert a == json_test.get_scalar("a") + assert b == json_test.get_boolean("b") + assert c == json_test.get_string("c") + + def test_array(self): + json_test = JSONSerializer() + a = np.array([1.5,2.4,3.3]) + b = ["a","b","c"] + json_test.add_numeric_vector("a", a) + json_test.add_string_vector("b", b) + np.testing.assert_array_equal(a, json_test.get_numeric_vector("a")) + assert b == json_test.get_string_vector("b") + + def test_forest(self): + # Generate sample data + random_seed = 1234 + rng = np.random.default_rng(random_seed) + n = 1000 + p_X = 10 + p_W = 1 + X = rng.uniform(0, 1, (n, p_X)) + def outcome_mean(X): + return np.where( + (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5, + np.where( + (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5, + np.where( + (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5, + 7.5 + ) + ) + ) + epsilon = rng.normal(0, 1, n) + y = outcome_mean(X) + epsilon + + # Train a BART model + bart_model = BARTModel() + bart_model.sample(X_train=X, y_train=y, num_gfr=10, num_mcmc=10) + + # Extract the training-set predictions cached on the model during sampling + forest_preds_y_mcmc_cached = bart_model.y_hat_train + + # Recompute predictions on the training covariates via predict() + forest_preds_y_mcmc_retrieved = bart_model.predict(X) + + # Roundtrip to / from JSON + json_test = JSONSerializer() + json_test.add_forest(bart_model.forest_container) + forest_container = json_test.get_forest_container("forest_0") + + # Predict from the deserialized forest 
container + forest_dataset = Dataset() + forest_dataset.add_covariates(X) + forest_preds_json_reload = forest_container.predict(forest_dataset)[:,bart_model.keep_indices] + forest_preds_json_reload = forest_preds_json_reload*bart_model.y_std + bart_model.y_bar + # Check the predictions + np.testing.assert_almost_equal(forest_preds_y_mcmc_cached, forest_preds_json_reload) + np.testing.assert_almost_equal(forest_preds_y_mcmc_retrieved, forest_preds_json_reload) + \ No newline at end of file