From 2d4989fc89089b64b4e245615fbbf00d72dc4ce4 Mon Sep 17 00:00:00 2001
From: Jaime Adan Cuevas Ramirez <jaime.cuevas.ramirez@intel.com>
Date: Wed, 2 Apr 2025 16:17:10 -0600
Subject: [PATCH 1/2] Create refactoring_of_benchmarks.py

Error handling (invalid JSON, missing files)
Automatic fixes (correct dataset paths if names mismatch)
Clear logging with warnings and actions
---
 refactoring_of_benchmarks.py | 64 ++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 refactoring_of_benchmarks.py

diff --git a/refactoring_of_benchmarks.py b/refactoring_of_benchmarks.py
new file mode 100644
index 00000000..9db4ec96
--- /dev/null
+++ b/refactoring_of_benchmarks.py
@@ -0,0 +1,64 @@
+import os
+import json
+
+# Configuration
+CONFIG_FILE = "xgb_cpu_main_config.json"
+DATASET_FOLDER = "dataset"
+EXPECTED_DATASETS = ["mlsr", "mortgage1Q", "plasticc", "santander"]
+
+def load_config():
+    """Return the parsed JSON from CONFIG_FILE, or None (after printing why) on failure."""
+    try:
+        # JSON is UTF-8 by spec; do not depend on the platform's locale encoding.
+        with open(CONFIG_FILE, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except FileNotFoundError:  # EAFP: no exists()-then-open() window for the file to vanish in
+        print(f"ERROR: Configuration file '{CONFIG_FILE}' not found. Verify its location.")
+        return None
+    except (json.JSONDecodeError, UnicodeDecodeError):  # malformed JSON or non-UTF-8 bytes
+        print(f"ERROR: Failed to parse '{CONFIG_FILE}'. Ensure it contains valid JSON.")
+        return None
+
+def check_datasets():
+    """Warn about each EXPECTED_DATASETS entry absent from DATASET_FOLDER; return them."""
+    absent = [
+        name for name in EXPECTED_DATASETS
+        if not os.path.exists(os.path.join(DATASET_FOLDER, name))
+    ]
+    for name in absent:
+        print(f"⚠️ WARNING: Dataset '{name}' is missing in '{DATASET_FOLDER}'.")
+
+    if absent:
+        # Remediation hints are printed once, after all per-dataset warnings.
+        print("\n🔹 Suggested Actions:")
+        print("- Ensure dataset names are correct in the 'dataset/' folder.")
+        print("- Download the missing datasets if necessary.")
+        print("- If dataset names differ, update 'xgb_cpu_main_config.json'.\n")
+    return absent
+
+def update_config(missing_datasets):
+    """Repoint missing datasets' config entries at existing '<DATASET_FOLDER>/<name>.csv' files."""
+    config = load_config()
+    entries = config.get("datasets") if isinstance(config, dict) else None
+    if not isinstance(entries, dict):  # load failed, or JSON is not {"datasets": {...}}
+        return
+    updated = False
+    for dataset in missing_datasets:
+        candidate = os.path.join(DATASET_FOLDER, f"{dataset}.csv")  # Adjust extension if necessary
+        if dataset in entries and os.path.exists(candidate):  # never write a dead path
+            print(f"🛠️ Fixing dataset path for '{dataset}' in {CONFIG_FILE}...")
+            entries[dataset] = candidate  # same dict object as config["datasets"]
+            updated = True
+    if updated:  # touch the file only when at least one entry actually changed
+        with open(CONFIG_FILE, "w", encoding="utf-8") as f:
+            json.dump(config, f, indent=4)
+        print(f"✅ {CONFIG_FILE} has been updated with corrected dataset paths.")
+
+if __name__ == "__main__":
+    # Script entry point: audit the dataset folder, then repair the config if needed.
+    print("🔍 Checking dataset availability...\n")
+    missing = check_datasets()
+    if not missing:
+        print("✅ All datasets are present. You can proceed with benchmarking.")
+    else:
+        update_config(missing)

From a73d25d103034c72ef27dea3ea0060a495fd5035 Mon Sep 17 00:00:00 2001
From: Jaime Adan Cuevas Ramirez <jaime.cuevas.ramirez@intel.com>
Date: Wed, 2 Apr 2025 16:20:55 -0600
Subject: [PATCH 2/2] Create compilation_frameworks.py

Compares ONNX vs. TVM performance on a scikit-learn model
Demonstrates ONNX conversion from sklearn
Uses TVM to optimize the model for inference
Benchmarks inference times for performance analysis
---
 compilation_frameworks.py | 51 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 compilation_frameworks.py

diff --git a/compilation_frameworks.py b/compilation_frameworks.py
new file mode 100644
index 00000000..1367690b
--- /dev/null
+++ b/compilation_frameworks.py
@@ -0,0 +1,51 @@
+import numpy as np
+import onnx
+import onnxruntime as ort
+import tvm
+from tvm import relay
+from skl2onnx import convert_sklearn
+from skl2onnx.common.data_types import FloatTensorType
+from sklearn.ensemble import RandomForestClassifier
+from time import time
+
+# Generate sample data; both runtimes are benchmarked on the same fixed batch.
+X_train = np.random.rand(100, 10).astype(np.float32)
+y_train = np.random.randint(0, 2, size=(100,))
+batch = X_train[:5]
+
+# Train a simple RandomForest model
+model = RandomForestClassifier(n_estimators=10)
+model.fit(X_train, y_train)
+
+# Convert model to ONNX (batch dimension left dynamic) and persist it to disk
+initial_type = [("input", FloatTensorType([None, 10]))]
+onnx_model = convert_sklearn(model, initial_types=initial_type)
+onnx.save_model(onnx_model, "model.onnx")
+
+# Load ONNX model for inference test
+ort_session = ort.InferenceSession("model.onnx")
+input_data = {ort_session.get_inputs()[0].name: batch}
+start = time()
+ort_outs = ort_session.run(None, input_data)
+print(f"ONNX Inference Time: {time() - start:.4f}s")
+
+# Import into TVM Relay with the exact shape later passed to set_input() (was (1, 10) vs. 5 rows).
+# NOTE(review): confirm TVM's ONNX frontend supports the tree-ensemble ops skl2onnx emits here.
+onnx_model = onnx.load("model.onnx")
+mod, params = relay.frontend.from_onnx(onnx_model, shape={"input": batch.shape})
+# Compile with TVM
+target = "llvm"
+with tvm.transform.PassContext(opt_level=3):
+    lib = relay.build(mod, target=target, params=params)
+
+# Run inference with TVM
+dev = tvm.cpu()
+dtype = "float32"
+tvm_model = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+tvm_model.set_input("input", tvm.nd.array(batch.astype(dtype)))
+start = time()
+tvm_model.run()
+tvm_out = tvm_model.get_output(0).numpy()
+print(f"TVM Optimized Inference Time: {time() - start:.4f}s")
+
+print("Optimization complete! Compare ONNX vs. TVM inference times.")