Skip to content

Commit 3be1b8b

Browse files
authored
Add xgboost to benchmark utilities (#7350)
This PR adds xgboost regression and classification to cuML's benchmark utilities.

Authors:
- Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
- Simon Adorf (https://github.com/csadorf)

URL: #7350
1 parent 8882188 commit 3be1b8b

File tree

4 files changed

+64
-3
lines changed

4 files changed

+64
-3
lines changed

python/cuml/cuml/benchmark/algorithms.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,13 @@
7070
HDBSCAN = None
7171

7272

# Optional dependency: xgboost may be absent in minimal environments.
# Fall back to None so the benchmark registry below can still be built;
# the corresponding AlgorithmPair entries simply hold None estimators.
try:
    from xgboost import XGBClassifier, XGBRegressor
except ImportError:
    XGBClassifier = XGBRegressor = None
7380
class AlgorithmPair:
7481
"""
7582
Wraps a cuML algorithm and (optionally) a cpu-based algorithm
@@ -347,6 +354,28 @@ def all_algorithms():
347354
accepts_labels=True,
348355
accuracy_function=metrics.r2_score,
349356
),
357+
AlgorithmPair(
358+
XGBClassifier,
359+
XGBClassifier,
360+
shared_args={"tree_method": "hist", "n_estimators": 100},
361+
cpu_args={"n_jobs": -1},
362+
cuml_args={"device": "cuda"},
363+
name="xgboost-classification",
364+
accepts_labels=True,
365+
cpu_data_prep_hook=_labels_to_int_hook,
366+
cuml_data_prep_hook=_labels_to_int_hook,
367+
accuracy_function=metrics.accuracy_score,
368+
),
369+
AlgorithmPair(
370+
XGBRegressor,
371+
XGBRegressor,
372+
shared_args={"tree_method": "hist", "n_estimators": 100},
373+
cpu_args={"n_jobs": -1},
374+
cuml_args={"device": "cuda"},
375+
name="xgboost-regression",
376+
accepts_labels=True,
377+
accuracy_function=metrics.r2_score,
378+
),
350379
AlgorithmPair(
351380
sklearn.manifold.TSNE,
352381
cuml.manifold.TSNE,

python/cuml/cuml/benchmark/automated/bench_classification.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,12 @@ def bench_svc_linear(gpubenchmark, bench_step, classification): # noqa: F811
6868

6969
def bench_svc_rbf(gpubenchmark, bench_step, classification): # noqa: F811
7070
_benchmark_algo(gpubenchmark, "SVC-RBF", bench_step, classification)
71+
72+
73+
def bench_xgboost_classification(
    gpubenchmark, bench_step, classification  # noqa: F811
):
    """Benchmark the xgboost-classification algorithm pair on the
    shared classification dataset fixture."""
    # xgboost is an optional dependency: skip instead of erroring out.
    pytest.importorskip("xgboost")
    algo_name = "xgboost-classification"
    _benchmark_algo(gpubenchmark, algo_name, bench_step, classification)

python/cuml/cuml/benchmark/automated/bench_regression.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,12 @@ def bench_svr_rbf(gpubenchmark, bench_step, regression1): # noqa: F811
8585

8686
def bench_svr_linear(gpubenchmark, bench_step, regression2): # noqa: F811
8787
_benchmark_algo(gpubenchmark, "SVR-Linear", bench_step, regression2)
88+
89+
90+
def bench_xgboost_regression(
    gpubenchmark, bench_step, regression1  # noqa: F811
):
    """Benchmark the xgboost-regression algorithm pair on the
    shared regression dataset fixture."""
    # xgboost is an optional dependency: skip instead of erroring out.
    pytest.importorskip("xgboost")
    algo_name = "xgboost-regression"
    _benchmark_algo(gpubenchmark, algo_name, bench_step, regression1)

python/cuml/tests/test_benchmark.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,16 +182,30 @@ def predict(self, X):
182182
# skipping UMAP-Supervised due to issue
183183
# https://github.com/rapidsai/cuml/issues/4243
184184
@pytest.mark.parametrize(
    "algo_name",
    [
        "DBSCAN",
        "LogisticRegression",
        "ElasticNet",
        "FIL",
        "xgboost-classification",
        "xgboost-regression",
    ],
)
def test_real_algos_runner(algo_name):
    """Run each real algorithm pair end-to-end through the accuracy
    comparison runner on a small synthetic dataset."""
    pair = algorithms.algorithm_by_name(algo_name)

    # These pairs wrap xgboost models; skip when the library is absent.
    xgboost_backed = {"FIL", "xgboost-classification", "xgboost-regression"}
    if algo_name in xgboost_backed:
        pytest.importorskip("xgboost")

    # Use appropriate dataset for regression algorithms
    regression_algos = {"ElasticNet", "xgboost-regression"}
    dataset = (
        "regression" if algo_name in regression_algos else "classification"
    )

    runner = AccuracyComparisonRunner(
        [50], [5], dataset_name=dataset, test_fraction=0.20
    )
    results = runner.run(pair)[0]
    print(results)

0 commit comments

Comments
 (0)