"""Check benchmark datasets and repair dataset paths in the benchmark config.

File: refactoring_of_benchmarks.py

Reconstructed from [PATCH 1/2] "Create refactoring_of_benchmarks.py"
(commit 2d4989fc89089b64b4e245615fbbf00d72dc4ce4, Jaime Adan Cuevas Ramirez,
Wed, 2 Apr 2025 16:17:10 -0600). The commit message lists:

- Error handling (invalid JSON, missing files)
- Automatic fixes (correct dataset paths if names mismatch)
- Clear logging with warnings and actions
"""
import json
import os

# Configuration
CONFIG_FILE = "xgb_cpu_main_config.json"
DATASET_FOLDER = "dataset"
EXPECTED_DATASETS = ["mlsr", "mortgage1Q", "plasticc", "santander"]


def load_config():
    """Load the benchmark configuration file.

    Returns:
        The parsed JSON content of ``CONFIG_FILE``, or ``None`` when the file
        is missing, cannot be read, or does not contain valid JSON. In every
        failure case an ``ERROR:`` line is printed.
    """
    # Open directly instead of testing os.path.exists() first: that avoids the
    # check-then-open race and also covers paths that exist but cannot be read
    # (a directory, a permission problem).
    try:
        with open(CONFIG_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"ERROR: Configuration file '{CONFIG_FILE}' not found. Verify its location.")
    except OSError as err:
        print(f"ERROR: Could not read '{CONFIG_FILE}': {err}")
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(f"ERROR: Failed to parse '{CONFIG_FILE}'. Ensure it contains valid JSON.")
    return None


def check_datasets():
    """Check if required datasets exist in the dataset folder.

    Prints one warning per missing dataset, followed by a list of suggested
    actions when at least one is missing.

    Returns:
        The names from ``EXPECTED_DATASETS`` for which
        ``DATASET_FOLDER/<name>`` does not exist, in their original order.
    """
    missing_datasets = [
        dataset
        for dataset in EXPECTED_DATASETS
        if not os.path.exists(os.path.join(DATASET_FOLDER, dataset))
    ]
    for dataset in missing_datasets:
        print(f"⚠️ WARNING: Dataset '{dataset}' is missing in '{DATASET_FOLDER}'.")

    if missing_datasets:
        # The hints are built from the constants so they stay correct if the
        # configuration above is changed.
        print("\n🔹 Suggested Actions:")
        print(f"- Ensure dataset names are correct in the '{DATASET_FOLDER}/' folder.")
        print("- Download the missing datasets if necessary.")
        print(f"- If dataset names differ, update '{CONFIG_FILE}'.\n")

    return missing_datasets


def update_config(missing_datasets):
    """Fix dataset names in the configuration file if necessary.

    For every name in ``missing_datasets`` that has an entry under the
    ``"datasets"`` key of the config, the entry is redirected to
    ``DATASET_FOLDER/<name>.csv`` -- but only when that file actually exists,
    so a broken path is never replaced by another broken path. The config
    file is rewritten only if at least one entry changed.

    Args:
        missing_datasets: Iterable of dataset names reported as missing.

    Returns:
        None. Problems are reported on stdout rather than raised.
    """
    config = load_config()
    if config is None:
        # load_config() has already printed the reason.
        return
    if not isinstance(config, dict):
        print(f"ERROR: '{CONFIG_FILE}' must contain a JSON object at the top level.")
        return

    datasets = config.get("datasets")
    if datasets is None:
        return
    if not isinstance(datasets, dict):
        print(f"ERROR: 'datasets' in '{CONFIG_FILE}' must be a JSON object.")
        return

    updated = False
    for dataset in missing_datasets:
        if dataset not in datasets:
            continue
        candidate = os.path.join(DATASET_FOLDER, f"{dataset}.csv")  # Adjust extension if necessary
        if not os.path.exists(candidate):
            print(
                f"⚠️ WARNING: '{candidate}' does not exist; "
                f"leaving '{dataset}' unchanged in {CONFIG_FILE}."
            )
            continue
        print(f"🛠️ Fixing dataset path for '{dataset}' in {CONFIG_FILE}...")
        datasets[dataset] = candidate
        updated = True

    if updated:
        # Serialize before opening for writing: mode "w" truncates the file,
        # so a serialization error must not be able to wipe the config.
        payload = json.dumps(config, indent=4)
        with open(CONFIG_FILE, "w", encoding="utf-8") as f:
            f.write(payload)
        print(f"✅ {CONFIG_FILE} has been updated with corrected dataset paths.")


def main():
    """Run the dataset check and repair the config when datasets are missing."""
    print("🔍 Checking dataset availability...\n")
    missing = check_datasets()

    if missing:
        update_config(missing)
    else:
        print("✅ All datasets are present. You can proceed with benchmarking.")


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Header of the next patch in this series, kept from the original text:
#   From a73d25d103034c72ef27dea3ea0060a495fd5035 Mon Sep 17 00:00:00 2001
#   From: Jaime Adan Cuevas Ramirez
#   Date: Wed, 2 Apr 2025 16:20:55 -0600
#   Subject: [PATCH 2/2] Create compilation_frameworks.py
#   Compares ONNX vs.
# ---------------------------------------------------------------------------
"""Compare ONNX Runtime and TVM inference on a scikit-learn model.

File: compilation_frameworks.py

Reconstructed from [PATCH 2/2] "Create compilation_frameworks.py".
The commit message lists:

- Compares ONNX vs. TVM performance on a scikit-learn model
- Demonstrates ONNX conversion from sklearn
- Uses TVM to optimize the model for inference
- Benchmarks inference times for performance analysis
"""
from time import perf_counter

import numpy as np
import onnx
import onnxruntime as ort
import tvm
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
from tvm import relay
from tvm.contrib import graph_executor

N_FEATURES = 10
BATCH_SIZE = 5  # rows used for both inference runs
INPUT_NAME = "input"
MODEL_PATH = "model.onnx"

# Generate sample data
X_train = np.random.rand(100, N_FEATURES).astype(np.float32)
y_train = np.random.randint(0, 2, size=(100,))
sample = X_train[:BATCH_SIZE]

# Train a simple RandomForest model
model = RandomForestClassifier(n_estimators=10)
model.fit(X_train, y_train)

# Convert model to ONNX
initial_type = [(INPUT_NAME, FloatTensorType([None, N_FEATURES]))]
onnx_model = convert_sklearn(model, initial_types=initial_type)
onnx.save_model(onnx_model, MODEL_PATH)

# Load ONNX model for inference test
ort_session = ort.InferenceSession(MODEL_PATH)
input_data = {ort_session.get_inputs()[0].name: sample}
# perf_counter() is monotonic and high-resolution, unlike wall-clock time().
start = perf_counter()
ort_outs = ort_session.run(None, input_data)
print(f"ONNX Inference Time: {perf_counter() - start:.4f}s")

# Optimize ONNX model with TVM
# The static input shape given to the importer must equal the shape of the
# array passed to set_input() below; the original compiled for (1, 10) but
# fed a (5, 10) batch.
# NOTE(review): the ONNX graph produced for a RandomForestClassifier may use
# operators the installed TVM ONNX frontend does not support -- TODO confirm.
onnx_model = onnx.load(MODEL_PATH)
mod, params = relay.frontend.from_onnx(
    onnx_model, shape={INPUT_NAME: (BATCH_SIZE, N_FEATURES)}
)

# Compile with TVM
target = "llvm"
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)

# Run inference with TVM
dev = tvm.cpu()
dtype = "float32"
tvm_model = graph_executor.GraphModule(lib["default"](dev))
tvm_model.set_input(INPUT_NAME, tvm.nd.array(sample.astype(dtype)))

start = perf_counter()
tvm_model.run()
tvm_out = tvm_model.get_output(0).numpy()
print(f"TVM Optimized Inference Time: {perf_counter() - start:.4f}s")

print("Optimization complete! Compare ONNX vs. TVM inference times.")