diff --git a/configs/README.md b/configs/README.md index e1cf8390..5a981f80 100644 --- a/configs/README.md +++ b/configs/README.md @@ -117,6 +117,7 @@ Configs have the three highest parameter keys: |:---------------|:--------------|:--------|:------------| | `algorithm`:`estimator` | None | | Name of measured estimator. | | `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. | +| `algorithm`:`num_batches`:`training` | 5 | | Number of times the `partial_fit` method is called during benchmarking; each call receives the full specified number of samples as one batch (the data is not split into `num_batches` chunks). For incremental estimators only. | | `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). | | `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. | | `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. 
| diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index fabf6d6d..46193894 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -4,7 +4,7 @@ "common knn parameters": { "algorithm": { "estimator_params": { - "n_neighbors": [10, 100], + "n_neighbors": 100, "weights": "uniform" } }, @@ -19,19 +19,10 @@ "synthetic classification data": { "algorithm": { "estimator": "KNeighborsClassifier", - "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2 } }, "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } - ] - }, - "synthetic regression data": { - "algorithm": { - "estimator": "KNeighborsRegressor", - "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } - }, - "data": [ - { "source": "make_regression", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "noise":1.5 } } + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -43,14 +34,6 @@ "sklearn knn parameters", "synthetic classification data" ] - }, - "sklearn brute knn reg": { - "SETS": [ - "sklearn-ex[gpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "synthetic regression data" - ] } } } diff --git a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json index e5113261..01d2a125 100644 --- a/configs/regular/bf16/pca.json +++ b/configs/regular/bf16/pca.json @@ -20,7 +20,7 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, 
"n_features": 10, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 3000000, "n_features": 10, "centers": 1 } } ] } }, diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 219840ea..962d7da9 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -19,7 +19,7 @@ "logreg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 64 parameters", + "large scale strong <=64 parameters", "spmd logreg parameters", "synthetic data", "spmd logreg2 parameters" diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 819f5fb5..bd9b3b51 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -324,27 +324,19 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function( - estimator_instance, method_instance, data_args, num_batches, batch_size -): +def create_online_function(estimator_instance, method_instance, data_args, num_batches): if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): for i in range(num_batches): - method_instance( - x[i * batch_size : (i + 1) * batch_size], - y[i * batch_size : (i + 1) * batch_size], - ) + method_instance(x, y) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(num_batches): - method_instance( - x.iloc[i * batch_size : (i + 1) * batch_size], - y.iloc[i * batch_size : (i + 1) * batch_size], - ) + method_instance(x, y) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() @@ -352,13 +344,13 @@ def dataframe_function(x, y): def ndarray_function(x): for i in range(num_batches): - method_instance(x[i * batch_size : (i + 1) * batch_size]) + 
method_instance(x) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(num_batches): - method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + method_instance(x) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() @@ -413,28 +405,17 @@ def measure_sklearn_estimator( data_args = (x_train,) else: data_args = (x_test,) + batch_size = get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ) if method == "partial_fit": - num_batches = get_bench_case_value(bench_case, "data:num_batches") - batch_size = get_bench_case_value(bench_case, "data:batch_size") - - if batch_size is None: - if num_batches is None: - num_batches = 5 - batch_size = ( - data_args[0].shape[0] + num_batches - 1 - ) // num_batches - if num_batches is None: - num_batches = ( - data_args[0].shape[0] + batch_size - 1 - ) // batch_size + num_batches = get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}", 5 + ) method_instance = create_online_function( - estimator_instance, - method_instance, - data_args, - num_batches, - batch_size, + estimator_instance, method_instance, data_args, num_batches ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": @@ -452,6 +433,10 @@ def measure_sklearn_estimator( metrics[method]["box filter mean[ms]"], metrics[method]["box filter std[ms]"], ) = measure_case(bench_case, method_instance, *data_args) + if batch_size is not None: + metrics[method]["throughput[samples/ms]"] = ( + (data_args[0].shape[0] // batch_size) * batch_size + ) / metrics[method]["time[ms]"] if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" sklearnex_logging_stream.seek(0) @@ -559,9 +544,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): for stage in estimator_methods.keys(): data_descs[stage].update( { - "batch_size": get_bench_case_value( - 
bench_case, f"algorithm:batch_size:{stage}" - ) + key: val + for key, val in { + "batch_size": get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ), + "num_batches": get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}" + ), + }.items() + if val is not None } ) if "n_classes" in data_description: diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index 2bc3a05e..fdc6dc5c 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -97,6 +97,7 @@ "order", "n_classes", "n_clusters", + "num_batches", "batch_size", ] @@ -262,10 +263,7 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: # only relative improvements are included in summary currently if len(column) > 1 and column[1] == f"{metric_name} relative improvement": metric_columns.append(column) - if metric_columns: - summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T - else: - summary = pd.DataFrame() + summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T summary.index = pd.Index([df_name]) return summary