diff --git a/environment.yml b/environment.yml
new file mode 100644
index 000000000..4d57dac77
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,25 @@
+name: pymc-examples
+channels:
+- conda-forge
+dependencies:
+- python=3.11
+- pymc
+- pymc-bart
+- nutpie
+# spatial notebooks
+- geopandas
+- folium
+- libpysal
+- rasterio
+- pip
+- pip:
+  - pymc-experimental
+  - preliz
+  - bambi
+  - jax
+  - papermill
+  - joblib
+  - jupyter
+  - seaborn
+  - watermark
+  - lifelines
diff --git a/scripts/run_notebooks/injected.py b/scripts/run_notebooks/injected.py
new file mode 100644
index 000000000..ffb8d8f6c
--- /dev/null
+++ b/scripts/run_notebooks/injected.py
@@ -0,0 +1,79 @@
+"""Code injected at the top of each notebook to mock long-running sampling."""
+
+import os
+
+import numpy as np
+import pymc as pm
+import xarray as xr
+
+
+def mock_sample(*args, **kwargs):
+    """Mock pm.sample by reshaping prior predictive draws into a posterior group."""
+    if len(args) > 0:
+        draws = args[0]
+    else:
+        draws = kwargs.get("draws", 1000)
+    random_seed = kwargs.get("random_seed", None)
+    rng = np.random.default_rng(random_seed)
+    model = kwargs.get("model", None)
+    chains = kwargs.get("chains", os.cpu_count())
+    idata = pm.sample_prior_predictive(
+        model=model,
+        random_seed=random_seed,
+        samples=draws,
+    )
+    n_chains = chains
+    expanded_chains = xr.DataArray(
+        np.ones(n_chains),
+        coords={"chain": np.arange(n_chains)},
+    )
+    # Broadcast the single prior chain across the requested number of chains.
+    idata.add_groups(
+        posterior=(idata.prior.mean("chain") * expanded_chains).transpose("chain", "draw", ...)
+    )
+    idata.posterior.attrs["sampling_time"] = 1.0
+
+    if "prior" in idata:
+        del idata.prior
+    if "prior_predictive" in idata:
+        del idata.prior_predictive
+
+    # Create mock sample stats, including a "diverging" variable
+    if "sample_stats" not in idata:
+        n_chains = chains
+        n_draws = draws
+        sample_stats = xr.Dataset(
+            {
+                "diverging": xr.DataArray(
+                    np.zeros((n_chains, n_draws), dtype=int),
+                    dims=("chain", "draw"),
+                ),
+                "energy": xr.DataArray(
+                    rng.normal(loc=150, scale=2.5, size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+                "tree_depth": xr.DataArray(
+                    rng.choice([1, 2, 3], p=[0.01, 0.86, 0.13], size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+                "acceptance_rate": xr.DataArray(
+                    rng.beta(0.5, 0.5, size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+                # "accept" is reported by some samplers instead of "acceptance_rate"
+                "accept": xr.DataArray(
+                    rng.choice([0, 1], size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+            }
+        )
+        idata.add_groups(sample_stats=sample_stats)
+
+    return idata
+
+
+# Patch PyMC so notebooks run quickly: sampling is mocked, and improper flat
+# priors are swapped for proper ones that support prior predictive sampling.
+pm.sample = mock_sample
+pm.HalfFlat = pm.HalfNormal
+pm.Flat = pm.Normal
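For context, a minimal sketch (kept out of the diff) of what a notebook sees once the injected code has run. The toy model is illustrative and executing from the repository root is an assumption; the shapes follow from how mock_sample broadcasts the single prior predictive chain.

.. code-block:: python

    import pymc as pm

    # Execute the injected code, as the runner does via the inserted notebook cell.
    exec(open("scripts/run_notebooks/injected.py").read())

    with pm.Model():
        mu = pm.Normal("mu", 0.0, 1.0)
        pm.Normal("y", mu=mu, sigma=1.0, observed=[0.1, -0.3, 0.2])
        # Returns quickly: prior predictive sampling under the hood.
        idata = pm.sample(draws=50, chains=2, random_seed=1)

    print(idata.posterior["mu"].shape)                 # (2, 50), i.e. (chain, draw)
    print(int(idata.sample_stats["diverging"].sum()))  # 0 divergences by construction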
diff --git a/scripts/run_notebooks/runner.py b/scripts/run_notebooks/runner.py
new file mode 100644
index 000000000..45ab6e620
--- /dev/null
+++ b/scripts/run_notebooks/runner.py
@@ -0,0 +1,218 @@
+"""CLI to run a notebook or a directory of notebooks.
+
+Arguments
+---------
+--notebooks: One or more notebooks or directories of notebooks to run. Required.
+--mock: Run notebooks with mock code. Default is True. If --no-mock is provided,
+    notebooks will run without mock code.
+
+Examples
+--------
+Run all notebooks in a directory with mocked code:
+
+.. code-block:: bash
+
+    python scripts/run_notebooks/runner.py --notebooks notebooks/ --mock
+
+Run a single notebook without mocked code:
+
+.. code-block:: bash
+
+    python scripts/run_notebooks/runner.py --notebooks notebooks/notebook.ipynb --no-mock
+
+Run all the notebooks in two different directories with mocked code (default):
+
+.. code-block:: bash
+
+    python scripts/run_notebooks/runner.py --notebooks notebooks/ notebooks2/
+
+"""
+
+import logging
+from argparse import ArgumentParser
+from dataclasses import dataclass
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import TypedDict
+from uuid import uuid4
+
+import papermill
+from joblib import Parallel, delayed
+from nbformat.notebooknode import NotebookNode
+from papermill.iorw import load_notebook_node, write_ipynb
+from rich.console import Console
+
+KERNEL_NAME: str = "python3"
+
+HERE = Path(__file__).parent
+INJECTED_CODE_FILE = HERE / "injected.py"
+INJECTED_CODE = INJECTED_CODE_FILE.read_text()
+
+
+def setup_logging() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    )
+
+
+def generate_random_id() -> str:
+    return str(uuid4())
+
+
+def inject_pymc_sample_mock_code(cells: list) -> None:
+    cells.insert(
+        0,
+        NotebookNode(
+            id=f"code-injection-{generate_random_id()}",
+            execution_count=sum(map(ord, "Mock pm.sample")),  # arbitrary, stable value
+            cell_type="code",
+            metadata={"tags": []},
+            outputs=[],
+            source=INJECTED_CODE,
+        ),
+    )
+
+
+def mock_run(notebook_path: Path, i: int, total: int) -> None:
+    nb = load_notebook_node(str(notebook_path))
+    inject_pymc_sample_mock_code(nb.cells)
+    with NamedTemporaryFile(suffix=".ipynb") as f:
+        write_ipynb(nb, f.name)
+        desc = f"({i} / {total}) Mocked {notebook_path.name}"
+        papermill.execute_notebook(
+            input_path=f.name,
+            output_path=None,
+            progress_bar={"desc": desc},
+            kernel_name=KERNEL_NAME,
+            cwd=notebook_path.parent,
+        )
+
+
+def actual_run(notebook_path: Path, i: int, total: int) -> None:
+    papermill.execute_notebook(
+        input_path=notebook_path,
+        output_path=None,
+        kernel_name=KERNEL_NAME,
+        progress_bar={"desc": f"({i} / {total}) Running {notebook_path.name}"},
+        cwd=notebook_path.parent,
+    )
+
+
+@dataclass
+class NotebookSuccess:
+    notebook_path: Path
+
+
+@dataclass
+class NotebookFailure:
+    notebook_path: Path
+    error: str
+
+
+def run_notebook(
+    notebook_path: Path,
+    i: int,
+    total: int,
+    mock: bool = True,
+) -> NotebookFailure | NotebookSuccess:
+    logging.info(f"Running notebook: {notebook_path.name}")
+    run = mock_run if mock else actual_run
+
+    try:
+        run(notebook_path, i=i, total=total)
+    except Exception as e:
+        logging.error(f"{e.__class__.__name__} encountered running notebook: {str(notebook_path)}")
+        return NotebookFailure(notebook_path=notebook_path, error=str(e))
+    else:
+        return NotebookSuccess(notebook_path=notebook_path)
+
+
+class RunParams(TypedDict):
+    notebook_path: Path
+    mock: bool
+    i: int
+    total: int
+
+
+def run_parameters(notebook_paths: list[Path], mock: bool = True) -> list[RunParams]:
+    def to_mock(notebook_path: Path, i: int) -> RunParams:
+        return RunParams(
+            notebook_path=notebook_path,
+            mock=mock,
+            i=i,
+            total=len(notebook_paths),
+        )
+
+    return [to_mock(notebook_path, i=i) for i, notebook_path in enumerate(notebook_paths, start=1)]
+
+
+def main(notebooks_to_run: list[Path], mock: bool = True) -> None:
+    console = Console()
+    setup_logging()
+    logging.info("Starting notebook runner")
+    logging.info(f"Running {len(notebooks_to_run)} notebook(s).")
+    results = Parallel(n_jobs=-1)(
+        delayed(run_notebook)(**run_params)
+        for run_params in run_parameters(notebooks_to_run, mock=mock)
+    )
+    errors: list[NotebookFailure] = list(filter(lambda x: isinstance(x, NotebookFailure), results))
+    successes: list[NotebookSuccess] = list(
+        filter(lambda x: isinstance(x, NotebookSuccess), results)
+    )
+
+    if not errors:
+        logging.info("All notebooks ran successfully!")
+        return
+
+    for error in errors:
+        console.rule(f"[bold red]Error running {error.notebook_path}[/bold red]")
+        console.print(error.error)
+
+    for success in successes:
+        console.print(f"[bold green]Success running {success.notebook_path}[/bold green]")
+
+    logging.error(f"{len(errors)} / {len(notebooks_to_run)} notebooks failed")
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--notebooks",
+        nargs="+",
+        required=True,
+        help="One or more notebooks or directories of notebooks to run.",
+    )
+    mock_group = parser.add_mutually_exclusive_group()
+    mock_group.add_argument(
+        "--mock",
+        action="store_true",
+        help="Run notebooks with mock code",
+        dest="mock",
+    )
+    mock_group.add_argument(
+        "--no-mock",
+        action="store_false",
+        help="Run notebooks without mock code",
+        dest="mock",
+    )
+    parser.set_defaults(mock=True)
+    args = parser.parse_args()
+
+    notebooks_to_run = []
+    notebooks = [Path(notebook) for notebook in args.notebooks]
+    for notebook in notebooks:
+        if notebook.is_dir():
+            notebooks_to_run.extend(notebook.glob("*.ipynb"))
+            notebooks_to_run.extend(notebook.glob("*/*.ipynb"))
+        else:
+            notebooks_to_run.append(notebook)
+
+    args.notebooks = notebooks_to_run
+
+    return args
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args.notebooks, mock=args.mock)
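As a usage note beyond the CLI: the runner can also be driven programmatically, for example from a CI script. A minimal sketch, assuming the working directory is scripts/run_notebooks/ so that runner is importable; the examples/ directory is hypothetical:

.. code-block:: python

    from pathlib import Path

    from runner import main

    # Mocked run of every notebook in a hypothetical examples/ directory.
    main(sorted(Path("examples").glob("*.ipynb")), mock=True)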