Skip to content
This repository was archived by the owner on Feb 7, 2025. It is now read-only.

Handle and correctly preprocess a variety of input types #5

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# StochasticTree Python Package

**NOTE**: we are in the process of refactoring this project so that the R, Python, and C++ source code sits in the [same repo](https://github.com/StochasticTree/stochtree-cpp/).

## Getting started

The python package can be installed from source. Before you begin, make sure you have [conda](https://www.anaconda.com/download) installed.
Expand Down
10 changes: 4 additions & 6 deletions demo/notebooks/causal_inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@
"outputs": [],
"source": [
"# RNG\n",
"random_seed = 101\n",
"rng = np.random.default_rng(random_seed)\n",
"rng = np.random.default_rng()\n",
"\n",
"# Generate covariates and basis\n",
"n = 1000\n",
Expand All @@ -53,9 +52,8 @@
"Z = rng.binomial(1, pi_X, n).astype(float)\n",
"\n",
"# Define the outcome mean functions (prognostic and treatment effects)\n",
"mu_X = pi_X*5\n",
"# tau_X = np.sin(X[:,1]*2*np.pi)\n",
"tau_X = X[:,1]*2\n",
"mu_X = pi_X*5 + 2*X[:,2]\n",
"tau_X = (X[:,1]*2 - 1)\n",
"\n",
"# Generate outcome\n",
"epsilon = rng.normal(0, 1, n)\n",
Expand Down Expand Up @@ -105,7 +103,7 @@
"outputs": [],
"source": [
"bcf_model = BCFModel()\n",
"bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=1000)"
"bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100)"
]
},
{
Expand Down
18 changes: 18 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Python Package Documentation

## Building Documentation Locally

The online documentation is built automatically upon successful PR merge (see [here](https://github.com/StochasticTree/stochtree-python/blob/main/.github/workflows/docs.yml) for the Github workflow).
To build the documentation locally, first ensure that you have [Sphinx](https://www.sphinx-doc.org/en/master/) installed, then navigate to the python package's main directory (i.e. `cd [path/to/stochtree-python]`),
install the package, and run `sphinx-build` as below

```
pip install .
sphinx-build -M html docs/source/ docs/build/
```

## Documentation Style

Module (class, function, etc...) documentation follows [the numpy standard](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard),
applied in Sphinx using the [napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) extension.

7 changes: 7 additions & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,23 @@ beautifulsoup4==4.12.3
certifi==2024.2.2
charset-normalizer==3.3.2
docutils==0.20.1
exceptiongroup==1.2.1
furo==2024.5.6
idna==3.7
imagesize==1.4.1
importlib_metadata==7.1.0
iniconfig==2.0.0
Jinja2==3.1.4
joblib==1.4.2
MarkupSafe==2.1.5
numpy==1.24.4
numpydoc==1.7.0
packaging==24.0
pandas==2.0.3
pluggy==1.5.0
pybind11==2.12.0
Pygments==2.18.0
pytest==8.2.1
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.32.2
Expand All @@ -32,7 +37,9 @@ sphinxcontrib-htmlhelp==2.0.1
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
tabulate==0.9.0
threadpoolctl==3.5.0
tomli==2.0.1
tzdata==2024.1
urllib3==2.2.1
zipp==3.18.2
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'numpydoc'
]

templates_path = ['_templates']
exclude_patterns = []



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

Expand Down
5 changes: 3 additions & 2 deletions stochtree/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
from .bcf import BCFModel
from .data import Dataset, Residual
from .forest import ForestContainer
from .preprocessing import CovariateTransformer
from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel
from .serialization import JSONSerializer
from .utils import NotSampledError

__all__ = ['BARTModel', 'BCFModel', 'Dataset', 'Residual', 'ForestContainer',
'RNG', 'ForestSampler', 'GlobalVarianceModel', 'LeafVarianceModel',
'JSONSerializer', 'NotSampledError']
'CovariateTransformer', 'RNG', 'ForestSampler', 'GlobalVarianceModel',
'LeafVarianceModel', 'JSONSerializer', 'NotSampledError']
181 changes: 108 additions & 73 deletions stochtree/bart.py

Large diffs are not rendered by default.

260 changes: 147 additions & 113 deletions stochtree/bcf.py

Large diffs are not rendered by default.

266 changes: 266 additions & 0 deletions stochtree/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
"""
Data preprocessing module, drawn largely from the sklearn preprocessing module, released under the BSD-3-Clause license, with the following copyright

Copyright (c) 2007-2024 The scikit-learn developers.
"""
from typing import Union, Optional, Any
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils.validation import check_array, column_or_1d
import numpy as np
import pandas as pd
import warnings

class CovariateTransformer:
"""Class that transforms covariates to a format that can be used to define tree splits
"""

def __init__(self) -> None:
self._is_fitted = False
self._ordinal_encoders = []
self._onehot_encoders = []
self._ordinal_feature_index = []
self._onehot_feature_index = []
self._processed_feature_types = []
self._original_feature_types = []

def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool:
if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f":
return True
else:
return False

def _process_unordered_categorical(self, covariate: pd.Series) -> int:
num_onehot = len(self._onehot_encoders)
category_list = covariate.array.categories.to_list()
enc = OneHotEncoder(categories=[category_list], sparse_output=False)
enc.fit(pd.DataFrame(covariate))
self._onehot_encoders.append(enc)
return num_onehot

def _process_ordered_categorical(self, covariate: pd.Series) -> int:
num_ord = len(self._ordinal_encoders)
category_list = covariate.array.categories.to_list()
enc = OrdinalEncoder(categories=[category_list])
enc.fit(pd.DataFrame(covariate))
self._ordinal_encoders.append(enc)
return num_ord

def _fit_pandas(self, covariates: pd.DataFrame) -> None:
self._num_original_features = covariates.shape[1]
self._ordinal_feature_index = [-1 for i in range(self._num_original_features)]
self._onehot_feature_index = [-1 for i in range(self._num_original_features)]
self._original_feature_types = [-1 for i in range(self._num_original_features)]
datetime_types = covariates.apply(lambda x: pd.api.types.is_datetime64_any_dtype(x))
object_types = covariates.apply(lambda x: pd.api.types.is_object_dtype(x))
interval_types = covariates.apply(lambda x: isinstance(x.dtype, pd.IntervalDtype))
period_types = covariates.apply(lambda x: isinstance(x.dtype, pd.PeriodDtype))
timedelta_types = np.logical_or(covariates.apply(lambda x: pd.api.types.is_timedelta64_dtype(x)),
covariates.apply(lambda x: pd.api.types.is_timedelta64_ns_dtype(x)))
sparse_types = covariates.apply(lambda x: isinstance(x.dtype, pd.SparseDtype))
bool_types = covariates.apply(lambda x: pd.api.types.is_bool_dtype(x))
categorical_types = covariates.apply(lambda x: isinstance(x.dtype, pd.CategoricalDtype))
float_types = covariates.apply(lambda x: pd.api.types.is_float_dtype(x))
integer_types = covariates.apply(lambda x: pd.api.types.is_integer_dtype(x))
string_types = covariates.apply(lambda x: pd.api.types.is_integer_dtype(x))
if np.any(datetime_types):
# raise ValueError("DateTime columns are currently unsupported")
datetime_cols = covariates.columns[datetime_types].to_list()
warn_msg = "The following columns are a type unsupported by stochtree (DateTime) and will be ignored: {}"
warnings.warn(warn_msg.format(datetime_cols))
if np.any(interval_types):
# raise ValueError("Interval columns are currently unsupported")
interval_cols = covariates.columns[interval_types].to_list()
warn_msg = "The following columns are a type unsupported by stochtree (Interval) and will be ignored: {}"
warnings.warn(warn_msg.format(interval_cols))
if np.any(period_types):
# raise ValueError("Period columns are currently unsupported")
period_cols = covariates.columns[period_types].to_list()
warn_msg = "The following columns are a type unsupported by stochtree (Period) and will be ignored: {}"
warnings.warn(warn_msg.format(period_cols))
if np.any(timedelta_types):
# raise ValueError("TimeDelta columns are currently unsupported")
timedelta_cols = covariates.columns[timedelta_types].to_list()
warn_msg = "The following columns are a type unsupported by stochtree (TimeDelta) and will be ignored: {}"
warnings.warn(warn_msg.format(timedelta_cols))
if np.any(sparse_types):
# raise ValueError("Sparse columns are currently unsupported")
sparse_cols = covariates.columns[sparse_types].to_list()
warn_msg = "The following columns are a type unsupported by stochtree (Sparse) and will be ignored: {}"
warnings.warn(warn_msg.format(sparse_cols))
if np.any(object_types):
# raise ValueError("Object columns are currently unsupported")
object_cols = covariates.columns[object_types].to_list()
warn_msg = "The following columns are a type unsupported by stochtree (object) and will be ignored: {}"
warnings.warn(warn_msg.format(object_cols))

for i in range(covariates.shape[1]):
covariate = covariates.iloc[:,i]
if categorical_types.iloc[i]:
self._original_feature_types[i] = "category"
if covariate.array.ordered:
ord_index = self._process_ordered_categorical(covariate)
self._ordinal_feature_index[i] = ord_index
self._processed_feature_types.append(1)
else:
onehot_index = self._process_unordered_categorical(covariate)
self._onehot_feature_index[i] = onehot_index
feature_ones = np.repeat(1, len(covariate.array.categories)).tolist()
self._processed_feature_types.extend(feature_ones)
elif string_types.iloc[i]:
self._original_feature_types[i] = "string"
onehot_index = self._process_unordered_categorical(covariate)
self._onehot_feature_index[i] = onehot_index
feature_ones = np.repeat(1, len(self._onehot_encoders[onehot_index].categories_[0])).tolist()
self._processed_feature_types.extend(feature_ones)
elif bool_types.iloc[i]:
self._original_feature_types[i] = "boolean"
self._processed_feature_types.append(1)
elif integer_types.iloc[i]:
self._original_feature_types[i] = "integer"
self._processed_feature_types.append(0)
elif float_types.iloc[i]:
self._original_feature_types[i] = "float"
self._processed_feature_types.append(0)
else:
self._original_feature_types[i] = "unsupported"

def _fit_numpy(self, covariates: np.array) -> None:
if covariates.ndim == 1:
covariates = np.expand_dims(covariates, 1)
elif covariates.ndim > 2:
raise ValueError("Covariates passed as a numpy array must be 1d or 2d")

self._num_original_features = covariates.shape[1]
self._ordinal_feature_index = [-1 for i in range(self._num_original_features)]
self._onehot_feature_index = [-1 for i in range(self._num_original_features)]
self._original_feature_types = ["float" for i in range(self._num_original_features)]

# Check whether the array is numeric
cov_dtype = covariates.dtype
if len(cov_dtype) == 0:
array_numeric = True
else:
array_numeric = True
for i in range(len(cov_dtype)):
if not self._check_is_numeric_dtype(cov_dtype[i]):
array_numeric = False
if not array_numeric:
raise ValueError("Covariates passed as np.array must all be simple numeric types (bool, integer, unsigned integer, floating point)")

# Scan for binary columns
for i in range(self._num_original_features):
num_unique = np.unique(covariates[:,i]).size
if num_unique == 2:
self._processed_feature_types.append(1)
else:
self._processed_feature_types.append(0)

def _fit(self, covariates: Union[pd.DataFrame, np.array]) -> None:
if isinstance(covariates, pd.DataFrame):
self._fit_pandas(covariates)
elif isinstance(covariates, np.ndarray):
self._fit_numpy(covariates)
else:
raise ValueError("covariates must be a pd.DataFrame or a np.array")
self._is_fitted = True

def _transform_pandas(self, covariates: pd.DataFrame) -> np.array:
if self._num_original_features != covariates.shape[1]:
raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality")

output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64)
output_iter = 0
for i in range(covariates.shape[1]):
covariate = covariates.iloc[:,i]
if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string":
if self._ordinal_feature_index[i] != -1:
ord_ind = self._ordinal_feature_index[i]
covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate))
output_array[:,output_iter] = np.squeeze(covariate_transformed)
output_iter += 1
else:
onehot_ind = self._onehot_feature_index[i]
covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate))
output_dim = covariate_transformed.shape[1]
output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed)
output_iter += output_dim

elif self._original_feature_types[i] == "boolean":
output_array[:,output_iter] = (covariate*1.0).to_numpy()
output_iter += 1

elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float":
output_array[:,output_iter] = (covariate).to_numpy()
output_iter += 1

return output_array

def _transform_numpy(self, covariates: np.array) -> np.array:
if covariates.ndim == 1:
covariates = np.expand_dims(covariates, 1)
elif covariates.ndim > 2:
raise ValueError("Covariates passed as a numpy array must be 1d or 2d")
if self._num_original_features != covariates.shape[1]:
raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality")

return covariates

def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array:
if self._check_is_fitted():
if isinstance(covariates, pd.DataFrame):
return self._transform_pandas(covariates)
elif isinstance(covariates, np.ndarray):
return self._transform_numpy(covariates)
else:
raise ValueError("covariates must be a pd.DataFrame or a np.array")
else:
raise ValueError("Attempting to call transform() from an CovariateTransformer that has not yet been fit")

def _check_is_fitted(self) -> bool:
return self._is_fitted

def fit(self, covariates: Union[pd.DataFrame, np.array]) -> None:
"""Fits a ``CovariateTransformer`` by unpacking (and storing) data type information on the input (raw) covariates
and then converting to a numpy array which can be passed to a tree ensemble sampler.

If ``covariates`` is a ``pd.DataFrame``, `column dtypes <https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes>`_
will be handled as follows:

* ``category``: one-hot encoded if unordered, ordinal encoded if ordered
* ``string``: one-hot encoded
* ``boolean``: passed through as binary integer, treated as ordered categorical by tree samplers
* integer (i.e. ``Int8``, ``Int16``, etc...): passed through as double (**note**: if you have categorical data stored as integers, you should explicitly convert it to categorical in pandas, see this `user guide <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_)
* float (i.e. ``Float32``, ``Float64``): passed through as double
* ``object``: currently unsupported, convert object columns to numeric or categorical before passing
* Datetime (i.e. ``datetime64``): currently unsupported, though datetime columns can be converted to numeric features, see `here <https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.html#pandas.Timestamp>`_
* Period (i.e. ``period[<freq>]``): currently unsupported, though period columns can be converted to numeric features, see `here <https://pandas.pydata.org/docs/reference/api/pandas.Period.html#pandas.Period>`_
* Interval (i.e. ``interval``, ``Interval[datetime64[ns]]``): currently unsupported, though interval columns can be converted to numeric or categorical features, see `here <https://pandas.pydata.org/docs/reference/api/pandas.Interval.html#pandas.Interval>`_
* Sparse (i.e. ``Sparse``, ``Sparse[float]``): currently unsupported, convert sparse columns to dense before passing

Columns with unsupported types will be ignored, with a warning.

If ``covariates`` is a ``np.array``, columns must be numeric and the only preprocessing done by ``CovariateTransformer.fit()`` is to
auto-detect binary columns. All other integer-valued columns will be passed through to the tree sampler as (continuous) numeric data.
If you would like to treat integer-valued data as categorical, you can either convert your numpy array to a pandas dataframe and
explicitly tag such columns as ordered / unordered categorical, or preprocess manually using ``sklearn.preprocessing.OneHotEncoder``
and ``sklearn.preprocessing.OrdinalEncoder``.

Parameters
----------
covariates : np.array or pd.DataFrame
Covariates to be preprocessed.

Returns
-------
self : CovariateTransformer
Fitted CovariateTransformer.
"""
self._fit(covariates)
return self

def transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array:
return self._transform(covariates)

def fit_transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array:
self._fit(covariates)
return self._transform(covariates)
1 change: 0 additions & 1 deletion stochtree/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

class NotSampledError(ValueError, AttributeError):
"""Exception class to raise if attempting to predict from a model before it has been sampled.

Expand Down
Loading