Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f75f919
docs: design for cacheable initial load (jupyter + server)
paddymul May 31, 2026
b37ae34
docs: sharpen replay-override parity test in initial-load-cache plan
paddymul May 31, 2026
d28ea5a
docs: lock initial-load-cache decisions (handshake, codec, cross-back…
paddymul May 31, 2026
5131bf1
docs: fold full design-review outcomes into initial-load-cache plan
paddymul May 31, 2026
52115a2
refactor: extract build_df_display_args shared by live + cache paths
paddymul May 31, 2026
33c37dd
test: failing tests for config_fingerprint (initial-load cache key)
paddymul May 31, 2026
38bc9c4
feat(cache): config_fingerprint — stable cross-process key for initia…
paddymul May 31, 2026
6b923f1
test: failing tests for lossless sd codec (initial-load cache)
paddymul May 31, 2026
825b11c
feat(cache): lossless type-tagged sd codec (serialize_sd/deserialize_sd)
paddymul May 31, 2026
4c3e931
test(cache): failing tests for initial-load producer/handshake/apply
paddymul May 31, 2026
ada7e05
feat(cache): initial-load producer, handshake, and consumer
paddymul May 31, 2026
c1a8316
test(cache): failing tests for the initial-load cache store
paddymul May 31, 2026
b6d910f
feat(cache): server-managed initial-load cache store + serve_window p…
paddymul May 31, 2026
8c16e43
test(cache): failing tests for /load_expr store wiring + /cache endpoint
paddymul May 31, 2026
b06d4b2
feat(cache): wire initial-load store into /load_expr + /cache endpoint
paddymul May 31, 2026
2765f6a
test(cache): failing tests for /load_expr cache hit + mismatch recompute
paddymul May 31, 2026
a643fee
feat(cache): /load_expr hit fast path + WS serve_window
paddymul May 31, 2026
0649564
docs: mark initial-load cache producer/store/server integration as la…
paddymul May 31, 2026
4ea53d2
test(cache): failing tests for widget initial_cache kwarg
paddymul May 31, 2026
b9c9a59
feat(cache): widget initial_cache kwarg + handshake
paddymul May 31, 2026
6106a65
docs: mark widget initial_cache mechanism (increment 5) as landed
paddymul May 31, 2026
4271282
test(cache): failing tests for hit-path window slicing + override replay
paddymul May 31, 2026
d45dd9c
fix(cache): slice hit-path window to request + replay display overrides
paddymul May 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 48 additions & 27 deletions buckaroo/buckaroo_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import os
import sys
import traceback
import warnings
from datetime import datetime
from typing import Literal, Union
import pandas as pd
Expand All @@ -30,9 +31,10 @@
from buckaroo.extension_utils import copy_extend

from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet, sd_to_parquet_b64
from .cache.initial_cache import apply_initial_cache, cache_mismatch_reason, extract_column_schema
from .dataflow.dataflow import CustomizableDataflow
from .dataflow.dataflow_extras import (Sampling, exception_protect)
from .dataflow.styling_core import (ComponentConfig, DFViewerConfig, DisplayArgs, OverrideColumnConfig, PinnedRowConfig, StylingAnalysis, merge_column_config, EMPTY_DFVIEWER_CONFIG)
from .dataflow.styling_core import (ComponentConfig, DFViewerConfig, DisplayArgs, OverrideColumnConfig, PinnedRowConfig, StylingAnalysis, build_df_display_args, EMPTY_DFVIEWER_CONFIG)
from .dataflow.autocleaning import PandasAutocleaning
from pathlib import Path

Expand Down Expand Up @@ -126,7 +128,8 @@ def __init__(self, orig_df, debug=False,
column_config_overrides:Union[Literal[None], OverrideColumnConfig]=None,
pinned_rows:Union[Literal[None], PinnedRowConfig]=None, extra_grid_config=None,
component_config:Union[Literal[None], ComponentConfig]=None,
init_sd=None, skip_stat_columns=None, skip_main_serial=False, record_transcript=False):
init_sd=None, skip_stat_columns=None, skip_main_serial=False, record_transcript=False,
initial_cache=None):
"""
BuckarooWidget was originally designed to extend CustomizableDataFlow

Expand Down Expand Up @@ -168,7 +171,38 @@ def _df_to_obj(idfself, df:pd.DataFrame):
bidirectional_wire(self, self.dataflow, "operation_results")
bidirectional_wire(self, self.dataflow, "buckaroo_options")
bidirectional_wire(self, self.dataflow, "command_config")


self._maybe_apply_initial_cache(initial_cache)

def _maybe_apply_initial_cache(self, bundle):
    """Check a host-supplied initial-load bundle and replay it if it matches.

    Does nothing when ``bundle`` is ``None``. Otherwise the bundle is compared
    with the dataflow's live configuration (analysis classes, sampling class,
    ``init_sd``, ``skip_stat_columns`` and the column schema of
    ``processed_df``) through ``cache_mismatch_reason``. If a mismatch reason
    comes back, a warning is emitted and the already-computed values are left
    alone; otherwise the bundle is handed to ``apply_initial_cache`` together
    with the dataflow's display settings.

    The dataflow has already been built by the time this runs, so a matching
    bundle does not skip any computation here.
    """
    if bundle is None:
        return

    flow = self.dataflow
    processed = flow.processed_df

    # Inputs the handshake compares against the bundle. An empty/falsy
    # init_sd is normalised to None, and the schema is only extracted when a
    # processed frame actually exists.
    handshake = dict(
        analysis_klasses=flow.analysis_klasses,
        sampling_klass=getattr(flow, 'sampling_klass', None),
        init_sd=getattr(flow, 'init_sd', None) or None,
        skip_stat_columns=getattr(flow, 'skip_stat_columns', None),
        schema=None if processed is None else extract_column_schema(processed))

    mismatch = cache_mismatch_reason(bundle, **handshake)
    if mismatch is not None:
        warnings.warn("initial_cache ignored (config mismatch): %s" % mismatch)
        return

    apply_initial_cache(
        self, bundle,
        df_display_klasses=flow.df_display_klasses,
        column_config_overrides=flow.column_config_overrides,
        component_config=flow.component_config,
        extra_grid_config=flow.extra_grid_config,
        pinned_rows=flow.pinned_rows,
        sd_to_jsondf=self._sd_to_jsondf)

def _df_to_obj(self, df:pd.DataFrame):
return pd_to_obj(self.sampling_klass.serialize_sample(df))

Expand Down Expand Up @@ -364,38 +398,25 @@ def _handle_widget_change(self, change):
'all_stats': self._sd_to_jsondf(merged_sd),
'empty': []}

temp_display_args = {}
for display_name, A_Klass in self.dataflow.df_display_klasses.items():
df_viewer_config = A_Klass.get_dfviewer_config(merged_sd, processed_df)
base_column_config = df_viewer_config['column_config']
df_viewer_config['column_config'] = merge_column_config(
base_column_config, self.dataflow.processed_df, self.dataflow.column_config_overrides)
disp_arg = {'data_key': A_Klass.data_key,
#'df_viewer_config': json.loads(json.dumps(df_viewer_config)),
'df_viewer_config': df_viewer_config,
'summary_stats_key': A_Klass.summary_stats_key}
temp_display_args[display_name] = disp_arg

if self.dataflow.pinned_rows is not None:
temp_display_args['main']['df_viewer_config']['pinned_rows'] = self.dataflow.pinned_rows
if self.dataflow.extra_grid_config:
temp_display_args['main']['df_viewer_config']['extra_grid_config'] = self.dataflow.extra_grid_config
if self.dataflow.component_config:
temp_display_args['main']['df_viewer_config']['component_config'] = self.dataflow.component_config

self.df_display_args = temp_display_args
self.df_display_args = build_df_display_args(
self.dataflow.df_display_klasses, merged_sd, processed_df,
self.dataflow.column_config_overrides,
pinned_rows=self.dataflow.pinned_rows,
extra_grid_config=self.dataflow.extra_grid_config,
component_config=self.dataflow.component_config)
_bk_flash("_handle_widget_change EXIT (df_display_args → JS)")


def __init__(self, orig_df, debug=False,
column_config_overrides:Union[Literal[None], OverrideColumnConfig]=None,
pinned_rows:Union[Literal[None], PinnedRowConfig]=None, extra_grid_config=None,
component_config:Union[Literal[None], ComponentConfig]=None,
init_sd=None, skip_stat_columns=None, record_transcript=False):
init_sd=None, skip_stat_columns=None, record_transcript=False, initial_cache=None):
super().__init__(orig_df, debug, column_config_overrides, pinned_rows,
extra_grid_config, component_config, init_sd,
skip_stat_columns=skip_stat_columns,
skip_main_serial=True, record_transcript=record_transcript)
skip_main_serial=True, record_transcript=record_transcript,
initial_cache=initial_cache)

def widget_tuple_args_bridge(change_unused):
self._handle_widget_change(change_unused)
Expand Down Expand Up @@ -468,10 +489,10 @@ def __init__(self, orig_df, debug=False,
column_config_overrides:Union[Literal[None], OverrideColumnConfig]=None,
pinned_rows:Union[Literal[None], PinnedRowConfig]=None, extra_grid_config=None,
component_config:Union[Literal[None], ComponentConfig]=None,
init_sd=None, skip_stat_columns=None):
init_sd=None, skip_stat_columns=None, initial_cache=None):
super().__init__(orig_df, debug, column_config_overrides, pinned_rows,
extra_grid_config, component_config, init_sd,
skip_stat_columns=skip_stat_columns)
skip_stat_columns=skip_stat_columns, initial_cache=initial_cache)
self.df_id = str(id(orig_df))


Expand Down
8 changes: 8 additions & 0 deletions buckaroo/cache/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Initial-load cache: snapshot the first render so it can be replayed
without touching the DataFrame or executing the expression.

See docs/initial-load-cache-design.md. The handshake (config_fingerprint +
schema) decides whether a precomputed bundle matches the widget's live
configuration; a mismatch warns and recomputes — the cache is never blindly
trusted.
"""
61 changes: 61 additions & 0 deletions buckaroo/cache/fingerprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Stable config fingerprint for the initial-load cache.

``config_fingerprint`` identifies the *data-touching* configuration — the set
of inputs that determine ``merged_sd`` and the row window — so the handshake
can decide whether a cached bundle matches the widget's live config without
recomputing. It is **cross-process stable**: keyed on each class's
``module.qualname`` (plus an optional per-class ``cache_version``), never on
``id()``, so a bundle built in one process validates in another.

Display knobs (column_config_overrides, component_config, pinned_rows, theme)
are deliberately *out* of the fingerprint — they're applied at replay from the
bundle, so re-theming never invalidates the cache. See
docs/initial-load-cache-design.md.
"""
import hashlib
import json
from typing import Any, Iterable, List, Optional

# Bump when the bundle schema or the assembly logic changes incompatibly, so
# old bundles fail the handshake (warn + recompute) rather than mis-serve.
INITIAL_CACHE_VERSION = 1


def _klass_id(kls: Any) -> str:
    """Build a process-independent identifier string for a class.

    The result has the form ``"<module>.<qualname>:<cache_version>"``:

    * ``<module>`` is ``kls.__module__`` (empty string when absent);
    * ``<qualname>`` is ``kls.__qualname__``, falling back to ``__name__`` and
      finally ``repr(kls)`` when the preferred attribute is missing or falsy;
    * ``<cache_version>`` is the optional ``cache_version`` class attribute
      (empty string when absent), which lets a class invalidate its own cached
      bundles without a global version bump.
    """
    module_name = getattr(kls, '__module__', '')

    qualified = getattr(kls, '__qualname__', None)
    if not qualified:
        qualified = getattr(kls, '__name__', repr(kls))

    version = getattr(kls, 'cache_version', '')
    return "{}.{}:{}".format(module_name, qualified, version)


def _sampling_id(sampling_klass: Any) -> str:
    """Return the fingerprint fragment describing the sampling class.

    ``None`` yields an empty string. Otherwise the fragment is the class
    identifier from ``_klass_id`` followed by the ``pre_limit``,
    ``serialize_limit`` and ``max_columns`` attributes (each rendered as an
    empty value when the attribute is absent), joined with ``|``.
    """
    if sampling_klass is None:
        return ''

    # Sampling decides which rows reach the analysis pipeline and the window,
    # so the limits that alter its output belong in the key next to its
    # identity.
    segments = [_klass_id(sampling_klass)]
    for label, attr in (('pre', 'pre_limit'), ('ser', 'serialize_limit'), ('cols', 'max_columns')):
        segments.append(f"{label}={getattr(sampling_klass, attr, '')}")
    return "|".join(segments)


def _repr_keys(obj: Any) -> Any:
    """Return a copy of *obj* in which every dict key is replaced by its ``repr``.

    Dicts are rebuilt recursively; lists and tuples are rebuilt as lists (JSON
    renders both as arrays); any other value is returned unchanged. Using
    ``repr`` rather than ``str`` keeps ``1`` and ``'1'`` distinct.
    """
    if isinstance(obj, dict):
        return {repr(k): _repr_keys(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_repr_keys(v) for v in obj]
    return obj


def _canonical_json(obj: Any) -> str:
    """Serialize *obj* to key-sorted JSON, tolerating awkward dict keys.

    ``json.dumps(..., default=str)`` only rescues un-encodable *values*; a dict
    key that is not ``str``/``int``/``float``/``bool``/``None`` (for example a
    tuple column name), or a set of keys that cannot be ordered together (mixed
    ``int`` and ``str``), raises ``TypeError``. The plain encoding is tried
    first so inputs that already worked keep producing identical output; only
    on ``TypeError`` are the keys rewritten via ``_repr_keys`` and re-encoded.
    """
    try:
        return json.dumps(obj, sort_keys=True, default=str)
    except TypeError:
        return json.dumps(_repr_keys(obj), sort_keys=True, default=str)


def config_fingerprint(*, analysis_klasses: Iterable[Any], sampling_klass: Any = None,
                       init_sd: Optional[dict] = None, skip_stat_columns: Optional[Iterable[str]] = None,
                       cache_version: Optional[str] = None) -> str:
    """Return a stable hex fingerprint of the data-touching configuration.

    Args:
        analysis_klasses: Analysis classes, identified through ``_klass_id``.
            Their order is preserved, so the same classes in a different
            order produce a different fingerprint.
        sampling_klass: Sampling class (or ``None``), identified through
            ``_sampling_id``.
        init_sd: Injected summary-stat overrides. Any falsy value (``None`` or
            an empty dict) contributes ``None``; otherwise its key-sorted JSON
            form is hashed. Keys that JSON cannot encode or sort are handled
            by ``_canonical_json`` instead of raising ``TypeError``.
        skip_stat_columns: Column names to skip; each is converted with
            ``str`` and the list is sorted, so input order is irrelevant.
        cache_version: Optional caller-supplied version tag included as-is.

    Returns:
        A 32-character hexadecimal BLAKE2b digest (16-byte digest size).
    """
    skip: List[str] = sorted(str(c) for c in (skip_stat_columns or []))
    payload = {
        'v': INITIAL_CACHE_VERSION,
        'analysis_klasses': [_klass_id(k) for k in analysis_klasses],
        'sampling': _sampling_id(sampling_klass),
        # init_sd injects/overrides stats, so its content is part of identity.
        # default=str (inside _canonical_json) stringifies values json cannot
        # encode natively, e.g. numpy scalars.
        'init_sd': _canonical_json(init_sd) if init_sd else None,
        'skip_stat_columns': skip,
        'cache_version': cache_version}
    # Every payload key is a plain string, so the outer dump cannot hit the
    # key-encoding problem that _canonical_json guards against.
    blob = json.dumps(payload, sort_keys=True, default=str)
    return hashlib.blake2b(blob.encode(), digest_size=16).hexdigest()
Loading
Loading