From b5ad233d539803da41ae41f98e7997f68394ec35 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 27 Sep 2024 02:29:40 -0700 Subject: [PATCH 1/7] Reduce config --- configs/incremental.json | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/configs/incremental.json b/configs/incremental.json index c9ffb19c..f09927ee 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -1,7 +1,7 @@ { "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, - "covariance data": { + "unlabeled dataset": { "data": [ { "source": "make_blobs", @@ -14,18 +14,7 @@ } ] }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 10000, - "n_features": [16, 64] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { + "labeled dataset": { "data": { "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, @@ -37,22 +26,11 @@ } } }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000, - "n_features": [16, 64] - }, - "split_kwargs": {"ignore": true} - } - }, "covariance": { "algorithm": [ { "estimator": "IncrementalEmpiricalCovariance", - "library": "sklearnex.covariance", + "library": "sklearnex", "estimator_methods": {"training": "partial_fit"}, "num_batches": {"training": 2} } @@ -62,7 +40,7 @@ "algorithm": [ { "estimator": "IncrementalBasicStatistics", - "library": "sklearnex.basic_statistics", + "library": "sklearnex", "num_batches": {"training": 2} } ] @@ -71,7 +49,7 @@ "algorithm": [ { "estimator": "IncrementalLinearRegression", - "library": "sklearnex.linear_model", + "library": "sklearnex", "num_batches": {"training": 2} } ] @@ -80,17 +58,17 @@ "algorithm": [ { "estimator": "IncrementalPCA", - "library": "sklearnex.preview.decomposition", + "library": "sklearnex.preview", "num_batches": {"training": 2} } ] } }, 
"TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "covariance data"]}, + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, "linear_regression": { - "SETS": ["common", "linear_regression", "linear_regression data"] + "SETS": ["common", "linear_regression", "labeled dataset"] }, - "pca": {"SETS": ["common", "pca", "pca data"]} + "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} } } From fc4ad2b12ffefebdc3fe3f7103d24fc997cdad0f Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 27 Sep 2024 04:53:32 -0700 Subject: [PATCH 2/7] Add covariance module to incremental config --- configs/incremental.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/incremental.json b/configs/incremental.json index f09927ee..d36e2a16 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -30,7 +30,7 @@ "algorithm": [ { "estimator": "IncrementalEmpiricalCovariance", - "library": "sklearnex", + "library": "sklearnex.covariance", "estimator_methods": {"training": "partial_fit"}, "num_batches": {"training": 2} } From 040802dc7229b4713b5ccab4de4248505e762b65 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 02:49:02 -0700 Subject: [PATCH 3/7] Rename example config --- .../{incremental.json => sklearnex_incremental_example.json} | 0 test-configuration-linux.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename configs/{incremental.json => sklearnex_incremental_example.json} (100%) diff --git a/configs/incremental.json b/configs/sklearnex_incremental_example.json similarity index 100% rename from configs/incremental.json rename to configs/sklearnex_incremental_example.json diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index 722d1008..d8c1a64e 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -48,7 +48,7 @@ steps: - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env - python -m sklbench 
--report -l DEBUG --report -c configs/incremental.json + python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh From 69cc4c1754024b2817fe87b3a0d89a926b45658b Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 03:54:18 -0700 Subject: [PATCH 4/7] Remove bs mentioning in config (need to be added later) --- configs/sklearnex_incremental_example.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json index d36e2a16..37b2c7fb 100644 --- a/configs/sklearnex_incremental_example.json +++ b/configs/sklearnex_incremental_example.json @@ -36,15 +36,6 @@ } ] }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "IncrementalBasicStatistics", - "library": "sklearnex", - "num_batches": {"training": 2} - } - ] - }, "linear_regression": { "algorithm": [ { From f275062098635b049f2ff822c524c44f7b62422a Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 08:36:17 -0700 Subject: [PATCH 5/7] Fix num_batches and batch_size reading from config --- configs/sklearnex_incremental_example.json | 6 +----- sklbench/benchmarks/sklearn_estimator.py | 8 ++++++-- sklbench/report/implementation.py | 2 ++ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json index 37b2c7fb..1fbbcafa 100644 --- a/configs/sklearnex_incremental_example.json +++ b/configs/sklearnex_incremental_example.json @@ -56,10 +56,6 @@ } }, "TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, - "linear_regression": { - "SETS": ["common", "linear_regression", "labeled dataset"] - }, - "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]} } } diff 
--git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 3f8b1641..c4f94c47 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -425,8 +425,12 @@ def measure_sklearn_estimator( data_args = (x_test,) if method == "partial_fit": - num_batches = get_bench_case_value(bench_case, "data:num_batches") - batch_size = get_bench_case_value(bench_case, "data:batch_size") + num_batches = get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}" + ) + batch_size = get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ) if batch_size is None: if num_batches is None: diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index df15b5eb..af0398dd 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -236,6 +236,7 @@ def get_result_tables_as_df( splitby_columns=["estimator", "method", "function"], compatibility_mode=False, ): + print(results["bench_cases"]) bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) @@ -244,6 +245,7 @@ def get_result_tables_as_df( if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) + print(bench_cases) for column in diffby_columns.copy(): if bench_cases[column].nunique() == 1: bench_cases.drop(columns=[column], inplace=True) From 5a9be80616e5dca5e50bd27145ce11c6316b4c2d Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 08:41:09 -0700 Subject: [PATCH 6/7] Revert accidentally pushed changes --- configs/sklearnex_incremental_example.json | 6 +++++- sklbench/report/implementation.py | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json index 1fbbcafa..37b2c7fb 100644 --- a/configs/sklearnex_incremental_example.json +++ b/configs/sklearnex_incremental_example.json @@ -56,6 +56,10 @@ } 
}, "TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]} + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, + "linear_regression": { + "SETS": ["common", "linear_regression", "labeled dataset"] + }, + "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} } } diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index af0398dd..df15b5eb 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -236,7 +236,6 @@ def get_result_tables_as_df( splitby_columns=["estimator", "method", "function"], compatibility_mode=False, ): - print(results["bench_cases"]) bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) @@ -245,7 +244,6 @@ def get_result_tables_as_df( if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) - print(bench_cases) for column in diffby_columns.copy(): if bench_cases[column].nunique() == 1: bench_cases.drop(columns=[column], inplace=True) From 1d48f3a1b35668def560fb05c4b200783102cfda Mon Sep 17 00:00:00 2001 From: Ethan Glaser Date: Mon, 17 Mar 2025 22:30:21 -0700 Subject: [PATCH 7/7] remove batch_size logic from incremental benchmarking for num_batches --- configs/README.md | 1 + sklbench/benchmarks/sklearn_estimator.py | 57 ++++++++++-------------- sklbench/report/implementation.py | 1 + 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/configs/README.md b/configs/README.md index 8d3c5ac2..07c92dc1 100644 --- a/configs/README.md +++ b/configs/README.md @@ -117,6 +117,7 @@ Configs have the three highest parameter keys: |:---------------|:--------------|:--------|:------------| | `algorithm`:`estimator` | None | | Name of measured estimator. | | `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. 
| +| `algorithm`:`num_batches`:`training` | 5 | | Number of `partial_fit` calls to benchmark. Each call receives the full dataset with the specified number of samples (the data is not split into `num_batches` parts). For incremental estimators only. | | `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). | | `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. | | `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. | diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index c4f94c47..dd0ef1a5 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -334,27 +334,19 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function( - estimator_instance, method_instance, data_args, num_batches, batch_size -): +def create_online_function(estimator_instance, method_instance, data_args, num_batches): if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): for i in range(num_batches): - method_instance( - x[i * batch_size : (i + 1) * batch_size], - y[i * batch_size : (i + 1) * batch_size], - ) + method_instance(x, y) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(num_batches): - method_instance( - x.iloc[i * batch_size : (i + 1) * batch_size], - y.iloc[i * batch_size : (i + 1) * batch_size], - ) + method_instance(x, y) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() @@ -362,13 +354,13 @@ def dataframe_function(x, y): def ndarray_function(x): for i in range(num_batches): - method_instance(x[i * batch_size : (i + 1) * 
batch_size]) + method_instance(x) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(num_batches): - method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + method_instance(x) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() @@ -423,32 +415,20 @@ def measure_sklearn_estimator( data_args = (x_train,) else: data_args = (x_test,) + batch_size = get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ) if method == "partial_fit": num_batches = get_bench_case_value( - bench_case, f"algorithm:num_batches:{stage}" - ) - batch_size = get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" + bench_case, f"algorithm:num_batches:{stage}", 5 ) - if batch_size is None: - if num_batches is None: - num_batches = 5 - batch_size = ( - data_args[0].shape[0] + num_batches - 1 - ) // num_batches - if num_batches is None: - num_batches = ( - data_args[0].shape[0] + batch_size - 1 - ) // batch_size - method_instance = create_online_function( estimator_instance, method_instance, data_args, - num_batches, - batch_size, + num_batches ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": @@ -465,6 +445,10 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], _, ) = measure_case(bench_case, method_instance, *data_args) + if batch_size is not None: + metrics[method]["throughput[samples/ms]"] = ( + (data_args[0].shape[0] // batch_size) * batch_size + ) / metrics[method]["time[ms]"] if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" sklearnex_logging_stream.seek(0) @@ -561,9 +545,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): for stage in estimator_methods.keys(): data_descs[stage].update( { - "batch_size": get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" - ) + key: val + for key, val in { + 
"batch_size": get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ), + "num_batches": get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}" + ) + }.items() + if val is not None } ) if "n_classes" in data_description: diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index 8e76479f..689396f1 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -94,6 +94,7 @@ "order", "n_classes", "n_clusters", + "num_batches", "batch_size", ]