From 2d4989fc89089b64b4e245615fbbf00d72dc4ce4 Mon Sep 17 00:00:00 2001
From: Jaime Adan Cuevas Ramirez
Date: Wed, 2 Apr 2025 16:17:10 -0600
Subject: [PATCH] Create refactoring_of_benchmarks.py

Error handling (invalid JSON, missing files)
Automatic fixes (correct dataset paths if names mismatch)
Clear logging with warnings and actions
---
 refactoring_of_benchmarks.py | 64 ++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 refactoring_of_benchmarks.py

diff --git a/refactoring_of_benchmarks.py b/refactoring_of_benchmarks.py
new file mode 100644
index 00000000..9db4ec96
--- /dev/null
+++ b/refactoring_of_benchmarks.py
@@ -0,0 +1,64 @@
+import os
+import json
+
+# Configuration
+CONFIG_FILE = "xgb_cpu_main_config.json"
+DATASET_FOLDER = "dataset"
+EXPECTED_DATASETS = ["mlsr", "mortgage1Q", "plasticc", "santander"]
+
+def load_config():
+    """Load the benchmark configuration file."""
+    if not os.path.exists(CONFIG_FILE):
+        print(f"ERROR: Configuration file '{CONFIG_FILE}' not found. Verify its location.")
+        return None
+
+    with open(CONFIG_FILE, "r") as f:
+        try:
+            return json.load(f)
+        except json.JSONDecodeError:
+            print(f"ERROR: Failed to parse '{CONFIG_FILE}'. Ensure it contains valid JSON.")
+            return None
+
+def check_datasets():
+    """Check if required datasets exist in the dataset folder."""
+    missing_datasets = []
+    for dataset in EXPECTED_DATASETS:
+        dataset_path = os.path.join(DATASET_FOLDER, dataset)
+        if not os.path.exists(dataset_path):
+            print(f"āš ļø WARNING: Dataset '{dataset}' is missing in '{DATASET_FOLDER}'.")
+            missing_datasets.append(dataset)
+
+    if missing_datasets:
+        print("\nšŸ”¹ Suggested Actions:")
+        print("- Ensure dataset names are correct in the 'dataset/' folder.")
+        print("- Download the missing datasets if necessary.")
+        print("- If dataset names differ, update 'xgb_cpu_main_config.json'.\n")
+
+    return missing_datasets
+
+def update_config(missing_datasets):
+    """Fix dataset names in the configuration file if necessary."""
+    config = load_config()
+    if not config:
+        return
+
+    updated = False
+    for dataset in missing_datasets:
+        if dataset in config.get("datasets", {}):
+            print(f"šŸ› ļø Fixing dataset path for '{dataset}' in {CONFIG_FILE}...")
+            config["datasets"][dataset] = os.path.join(DATASET_FOLDER, f"{dataset}.csv")  # Adjust extension if necessary
+            updated = True
+
+    if updated:
+        with open(CONFIG_FILE, "w") as f:
+            json.dump(config, f, indent=4)
+        print(f"āœ… {CONFIG_FILE} has been updated with corrected dataset paths.")
+
+if __name__ == "__main__":
+    print("šŸ” Checking dataset availability...\n")
+    missing = check_datasets()
+
+    if missing:
+        update_config(missing)
+    else:
+        print("āœ… All datasets are present. You can proceed with benchmarking.")
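
For reference, update_config() assumes that xgb_cpu_main_config.json contains a top-level "datasets" object mapping each expected dataset name to its on-disk path. The upstream config may be structured differently; the snippet below is only a minimal sketch of the shape this script expects, with hypothetical placeholder paths, useful for trying the checker locally.

    # Minimal sketch of the config shape assumed by load_config()/update_config():
    # a top-level "datasets" object mapping each expected dataset name to a path.
    # The paths below are hypothetical placeholders, not upstream defaults.
    import json

    example_config = {
        "datasets": {
            "mlsr": "dataset/mlsr.csv",
            "mortgage1Q": "dataset/mortgage1Q.csv",
            "plasticc": "dataset/plasticc.csv",
            "santander": "dataset/santander.csv",
        }
    }

    # Write the sketch to the file name the script looks for.
    with open("xgb_cpu_main_config.json", "w") as f:
        json.dump(example_config, f, indent=4)

With such a file in place, running `python refactoring_of_benchmarks.py` prints a warning for each dataset missing from the 'dataset/' folder and, if any are missing, rewrites the matching entries under "datasets" to point at '<dataset>.csv' inside that folder.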