diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..c3ca6092
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,52 @@
+# stochtree 0.1.2
+
+## R
+
+* Fixed indexing bug in cleanup of grow-from-root (GFR) samples in BART and BCF models
+* Avoid using covariate preprocessor in `computeForestLeafIndices` function when a `ForestSamples` object is provided
+
+## Python
+
+* Specify float dtype in numpy ones and zeros (i.e. `np.zeros(num_zeros, dtype=float)`) [#161](https://github.com/StochasticTree/stochtree/pull/161)
+
+# stochtree 0.1.1
+
+## R
+
+* Fixed initialization bug in several R package code examples for random effects models
+
+# stochtree 0.1.0
+
+## Python
+
+* Initial release on PyPI.
+* Support for sampling stochastic tree ensembles using two algorithms: MCMC and Grow-From-Root (GFR)
+* High-level model types supported:
+    * Supervised learning with constant leaves or user-specified leaf regression models
+    * Causal effect estimation with binary or continuous treatments
+* Additional high-level modeling features:
+    * Forest-based variance function estimation (heteroskedasticity)
+    * Additive (univariate or multivariate) group random effects
+    * Multi-chain sampling and support for parallelism
+    * "Warm-start" initialization of MCMC forest samplers via the Grow-From-Root (GFR) algorithm
+    * Automated preprocessing / handling of categorical variables
+* Low-level interface:
+    * Ability to combine a forest sampler with other (additive) model terms, without using C++
+    * Combine and sample an arbitrary number of forests or random effects terms
+
+## R
+
+* Initial release on CRAN.
+* Support for sampling stochastic tree ensembles using two algorithms: MCMC and Grow-From-Root (GFR)
+* High-level model types supported:
+    * Supervised learning with constant leaves or user-specified leaf regression models
+    * Causal effect estimation with binary or continuous treatments
+* Additional high-level modeling features:
+    * Forest-based variance function estimation (heteroskedasticity)
+    * Additive (univariate or multivariate) group random effects
+    * Multi-chain sampling and support for parallelism
+    * "Warm-start" initialization of MCMC forest samplers via the Grow-From-Root (GFR) algorithm
+    * Automated preprocessing / handling of categorical variables
+* Low-level interface:
+    * Ability to combine a forest sampler with other (additive) model terms, without using C++
+    * Combine and sample an arbitrary number of forests or random effects terms
diff --git a/demo/notebooks/two-step-bart.ipynb b/demo/notebooks/two-step-bart.ipynb
new file mode 100644
index 00000000..745fcad2
--- /dev/null
+++ b/demo/notebooks/two-step-bart.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Two-Step BART Modeling\n",
+    "\n",
+    "Here we consider models of the form\n",
+    "\n",
+    "$$y = X\beta + f(X, X\beta) + \epsilon$$\n",
+    "\n",
+    "where $f$ is a stochastic tree model, $\beta$ is a vector of linear regression coefficients, and $\epsilon \sim \mathcal{N}(0,\sigma^2)$"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The model is fit in two stages:\n",
+    "\n",
+    "1. $\beta$ is estimated via the linear regression $y = X\beta + \nu$\n",
+    "2. $f$ is sampled from $y - X\hat{\beta} = f(X, X\hat{\beta}) + \epsilon$, where $\epsilon \sim \mathcal{N}(0,\sigma^2)$\n",
+    "\n",
+    "A compact sketch of this procedure is given below; the demo that follows walks through each step in full."
+   ]
+  },
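+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a minimal sketch, stage 1 fits `LinearRegression` and stage 2 samples `BARTModel` on the stage-1 residuals, with the linear fit appended as an extra covariate. This is purely illustrative: `two_step_bart` is a hypothetical helper written for this notebook, not part of the `stochtree` API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "\n",
+    "from stochtree import BARTModel\n",
+    "\n",
+    "\n",
+    "def two_step_bart(X, y, X_new, **bart_kwargs):\n",
+    "    # Stage 1: estimate beta by ordinary least squares\n",
+    "    lm = LinearRegression().fit(X, y)\n",
+    "    # Stage 2: sample the forest on the stage-1 residuals, with the\n",
+    "    # linear fit included as an additional covariate\n",
+    "    bart = BARTModel()\n",
+    "    bart.sample(\n",
+    "        X_train=np.c_[X, lm.predict(X)],\n",
+    "        y_train=y - lm.predict(X),\n",
+    "        X_test=np.c_[X_new, lm.predict(X_new)],\n",
+    "        **bart_kwargs,\n",
+    "    )\n",
+    "    # Outcome-scale predictions add the linear fit back to the\n",
+    "    # forest's residual fit (one column per posterior draw)\n",
+    "    return lm.predict(X_new)[:, None] + bart.y_hat_test"
+   ]
+  },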
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To begin, we load the necessary libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "from stochtree import BARTModel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Demo 1: Supervised Learning\n",
+    "\n",
+    "Consider a nonlinear single-index model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generate sample data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# RNG\n",
+    "rng = np.random.default_rng()\n",
+    "\n",
+    "# Generate covariates\n",
+    "n = 5300\n",
+    "p_X = 5\n",
+    "X = rng.uniform(-10./p_X, 10./p_X, (n, p_X))\n",
+    "\n",
+    "\n",
+    "# Define the single-index basis\n",
+    "def single_index_basis(X, beta=None):\n",
+    "    # Draw a random coefficient vector if none is supplied\n",
+    "    if beta is None:\n",
+    "        _, p_x = X.shape\n",
+    "        rng = np.random.default_rng()\n",
+    "        beta = 2.0 * p_x * rng.dirichlet(alpha=np.ones(p_x, dtype=float))\n",
+    "    return np.squeeze(np.matmul(X, beta))\n",
+    "\n",
+    "\n",
+    "# Define the outcome mean function\n",
+    "def outcome_mean(basis):\n",
+    "    return 0.1 * basis + np.sin(2.0 - basis) / (1.0 + np.abs(basis - 2.0))\n",
+    "\n",
+    "\n",
+    "# Generate the outcome, with noise scaled to a signal-to-noise ratio of 3\n",
+    "epsilon = rng.normal(0, 1, n)\n",
+    "beta = 2.0 * p_X * rng.dirichlet(alpha=np.ones(p_X, dtype=float))\n",
+    "Xb = single_index_basis(X, beta)\n",
+    "f_x = outcome_mean(Xb)\n",
+    "snr = 3\n",
+    "sig = np.std(f_x) / snr\n",
+    "y = f_x + epsilon * sig"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Train-test split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_inds = np.arange(n)\n",
+    "train_inds, test_inds = train_test_split(sample_inds, test_size=300)\n",
+    "X_train = X[train_inds, :]\n",
+    "X_test = X[test_inds, :]\n",
+    "Xb_train = Xb[train_inds]\n",
+    "Xb_test = Xb[test_inds]\n",
+    "y_train = y[train_inds]\n",
+    "y_test = y[test_inds]\n",
+    "f_x_train = f_x[train_inds]\n",
+    "f_x_test = f_x[test_inds]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run single-step BART"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_step_bart_model = BARTModel()\n",
+    "mean_params = {\"num_trees\": 500, \"sample_sigma2_leaf\": False}\n",
+    "single_step_bart_model.sample(\n",
+    "    X_train=X_train,\n",
+    "    y_train=y_train,\n",
+    "    X_test=X_test,\n",
+    "    num_gfr=5,\n",
+    "    num_mcmc=200,\n",
+    "    mean_forest_params=mean_params,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.scatter(x=np.mean(single_step_bart_model.y_hat_test, axis=1), y=y_test, label=\"Outcome\")\n",
+    "plt.scatter(x=np.mean(single_step_bart_model.y_hat_test, axis=1), y=f_x_test, label=\"Mean function\")\n",
+    "plt.xlabel(\"BART prediction\")\n",
+    "plt.ylabel(\"Actual\")\n",
+    "plt.legend()\n",
+    "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n",
+    "plt.show()"
+   ]
+  },
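+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each column of `y_hat_test` is one retained MCMC draw, so posterior summaries are simple row-wise statistics. As a quick sketch (assuming the draws are arranged along axis 1, consistent with the means above), we can form pointwise 90% credible intervals and check their empirical coverage of the held-out outcomes:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pointwise 90% posterior credible intervals from the retained draws\n",
+    "lower = np.percentile(single_step_bart_model.y_hat_test, 5, axis=1)\n",
+    "upper = np.percentile(single_step_bart_model.y_hat_test, 95, axis=1)\n",
+    "coverage = np.mean((y_test >= lower) & (y_test <= upper))\n",
+    "print(f\"Empirical coverage of the 90% intervals: {coverage:.3f}\")"
+   ]
+  },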
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run two-step linear model + BART"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "linear_model = LinearRegression()\n",
+    "linear_model.fit(X=X_train, y=y_train)\n",
+    "yhat_lm_train = np.squeeze(linear_model.predict(X=X_train))\n",
+    "yhat_lm_test = np.squeeze(linear_model.predict(X=X_test))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "two_step_bart_model = BARTModel()\n",
+    "mean_params = {\"num_trees\": 50, \"max_depth\": 15}\n",
+    "# Append the stage-1 linear fit to the covariates\n",
+    "two_step_bart_covariate_train = np.c_[X_train, yhat_lm_train]\n",
+    "two_step_bart_covariate_test = np.c_[X_test, yhat_lm_test]\n",
+    "# Sample the forest on the stage-1 residuals\n",
+    "two_step_bart_model.sample(\n",
+    "    X_train=two_step_bart_covariate_train,\n",
+    "    y_train=y_train - yhat_lm_train,\n",
+    "    X_test=two_step_bart_covariate_test,\n",
+    "    num_gfr=10,\n",
+    "    num_mcmc=200,\n",
+    "    mean_forest_params=mean_params,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Inspect the MCMC (BART) samples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_step_preds_y_mcmc = single_step_bart_model.y_hat_test\n",
+    "single_step_avg_mcmc = np.squeeze(single_step_preds_y_mcmc).mean(axis=1, keepdims=True)\n",
+    "two_step_preds_y_mcmc = two_step_bart_model.y_hat_test\n",
+    "# Two-step predictions add the linear fit back to the forest's residual fit\n",
+    "two_step_avg_mcmc = np.squeeze(two_step_preds_y_mcmc).mean(axis=1, keepdims=True) + np.expand_dims(yhat_lm_test, 1)\n",
+    "plt.scatter(x=Xb_test, y=single_step_avg_mcmc, label=\"Classic BART\")\n",
+    "plt.scatter(x=Xb_test, y=two_step_avg_mcmc, label=\"Linear Augmented BART\")\n",
+    "plt.xlabel(\"Index variable\")\n",
+    "plt.ylabel(\"Model predictions\")\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.scatter(x=single_step_avg_mcmc, y=y_test, label=\"Classic BART\")\n",
+    "plt.scatter(x=two_step_avg_mcmc, y=y_test, label=\"Linear Augmented BART\")\n",
+    "plt.xlabel(\"Model predictions\")\n",
+    "plt.ylabel(\"True outcome\")\n",
+    "plt.legend()\n",
+    "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.scatter(x=single_step_avg_mcmc, y=f_x_test, label=\"Classic BART\")\n",
+    "plt.scatter(x=two_step_avg_mcmc, y=f_x_test, label=\"Linear Augmented BART\")\n",
+    "plt.xlabel(\"Model predictions\")\n",
+    "plt.ylabel(\"True mean function\")\n",
+    "plt.legend()\n",
+    "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Compute the root mean squared difference between $\hat{f}(X)$ and $f(X)$ on the test set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_step_estimation_rmse = np.sqrt(np.mean(np.power(f_x_test - np.squeeze(single_step_avg_mcmc), 2)))\n",
+    "two_step_estimation_rmse = np.sqrt(np.mean(np.power(f_x_test - np.squeeze(two_step_avg_mcmc), 2)))\n",
+    "print(f\"Single step root mean squared estimation error: {single_step_estimation_rmse}\")\n",
+    "print(f\"Two step root mean squared estimation error: {two_step_estimation_rmse}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Compute the root mean squared difference between $\hat{f}(X)$ and $y$ on the test set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_step_prediction_rmse = np.sqrt(np.mean(np.power(y_test - np.squeeze(single_step_avg_mcmc), 2)))\n",
+    "two_step_prediction_rmse = np.sqrt(np.mean(np.power(y_test - np.squeeze(two_step_avg_mcmc), 2)))\n",
+    "print(f\"Single step root mean squared prediction error: {single_step_prediction_rmse}\")\n",
+    "print(f\"Two step root mean squared prediction error: {two_step_prediction_rmse}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}