Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions benchmarks/benchmark_db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def write_run(

from benchmark_db_writer import bq_writer_utils
from benchmark_db_writer import dataclass_bigquery_writer
from benchmark_db_writer.run_summary_writer import sample_run_summary_writer
from benchmark_db_writer.run_summary_writer import run_summary_writer
from benchmark_db_writer.schema.workload_benchmark_v2 import workload_benchmark_v2_schema

def get_db_client(
Expand All @@ -168,9 +168,9 @@ def get_db_client(
print(options.model_id)

if (
sample_run_summary_writer.validate_model_id(options.model_id, options.is_test)
and sample_run_summary_writer.validate_hardware_id(options.hardware_id, options.is_test)
and sample_run_summary_writer.validate_software_id(options.software_id, options.is_test)
run_summary_writer.validate_model_id(options.model_id, options.is_test)
and run_summary_writer.validate_hardware_id(options.hardware_id, options.is_test)
and run_summary_writer.validate_software_id(options.software_id, options.is_test)
):
summary = workload_benchmark_v2_schema.WorkloadBenchmarkV2Schema(
run_id=f"run-{uuid.uuid4()}",
Expand All @@ -179,6 +179,7 @@ def get_db_client(
hardware_id=options.hardware_id,
hardware_num_chips=number_of_chips,
hardware_num_nodes=number_of_nodes,
hardware_num_slices=options.hardware_num_slices,
result_success=run_success,
configs_framework=framework_config_in_json,
configs_env=env_variables,
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import os.path

# This is the MaxText root: with "max_utils.py"; &etc. TODO: Replace `os.path.basename` with `os.path.abspath`
MAXTEXT_PKG_DIR = os.environ.get("MAXTEXT_PKG_DIR", "MaxText")
MAXTEXT_PKG_DIR = os.environ.get("MAXTEXT_PKG_DIR", "src/MaxText")

# This is the maxtext repo root: with ".git" folder; "README.md"; "pyproject.toml"; &etc.
MAXTEXT_REPO_ROOT = os.environ.get(
Expand Down
7 changes: 6 additions & 1 deletion benchmarks/maxtext_xpk_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ def __post_init__(self):
else:
self.num_devices_per_slice = int(self.device_type.split("-")[1]) / 2
self.topology = ""
self.hardware_id = self.device_type.split("-")[0]
if self.hardware_id == "v5litepod":
self.hardware_id = "v5e"


def wait_for_xpk_workload_completion(cluster_config: XpkClusterConfig, workload_name, xpk_path) -> int:
Expand Down Expand Up @@ -341,6 +344,7 @@ def _build_args_from_config(wl_config: WorkloadConfig) -> dict:
"model_id": wl_config.model.model_type,
"hardware_id": wl_config.hardware_id,
"software_id": "jax_maxtext",
"hardware_num_slices": wl_config.num_slices,
"number_of_chips": wl_config.num_devices_per_slice * wl_config.num_slices,
"container_image_name": wl_config.base_docker_image,
"global_batch_size": per_device_batch_size * wl_config.num_devices_per_slice * wl_config.num_slices,
Expand Down Expand Up @@ -445,7 +449,8 @@ def build_user_command(
f"base_output_directory={wl_config.base_output_directory}",
f"{vertex_tensorboard}",
f"{run_name_command}",
f"{enable_metrics_cmd}" f"{upload_hlo_dump}",
f"{enable_metrics_cmd}",
f"{upload_hlo_dump}",
]
)
return command
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/recipes/runner_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def generate_and_run_workloads(user_config, num_slices_list, num_steps, priority
num_slices_list: A list of the number of slices to be executed.
num_steps: The number of steps for each workload.
"""
if user_config.bq_enable and (not user_config.bq_db_project or not user_config.bq_db_dataset):
logging.error("Validation FAILED: BQ is enabled, but project or dataset is missing.")
return 1
xpk_workload_cmds = []
xpk_workload_names = []

Expand Down Expand Up @@ -65,6 +68,9 @@ def generate_and_run_workloads(user_config, num_slices_list, num_steps, priority
xpk_path=user_config.xpk_path,
num_steps=num_steps,
priority=priority,
generate_metrics_and_upload_to_big_query=user_config.bq_enable,
db_project=user_config.bq_db_project,
db_dataset=user_config.bq_db_dataset,
)

# Generate XPK command
Expand Down
5 changes: 5 additions & 0 deletions benchmarks/recipes/user_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ class UserConfig:
selected_model_names: list[str] = dataclasses.field(default_factory=lambda: ["llama3_1_8b_8192"])
num_slices_list: list[int] = dataclasses.field(default_factory=lambda: [2])

# BigQuery configuration
bq_enable: bool = False
bq_db_project: str = ""
bq_db_dataset: str = ""

# other configuration
xpk_path: str = "~/xpk"
max_restarts: int = 0
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/upload_metrics_to_bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ def add_parser_arguments(parser: argparse.ArgumentParser):
default=True,
help="Whether to use the testing project or production project",
)
parser.add_argument(
"--hardware_num_slices",
type=int,
required=False,
help="hardware slice number",
)


def download_metrics_file_locally(metrics_gcs_file: str, local_file: str) -> int:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
absl-py
aqtp
array-record
benchmark_db_writer@git+https://github.com/CIeNET-International/aotc.git@c0bef62eac87c99152ff2e9fd48da1f7d9f3cc04#subdirectory=src/aotc/benchmark_db_writer
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we depending on a specific commit from a forked repo? Can we not upstream that?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We forked the repo from https://github.com/AI-Hypercomputer/aotc/tree/main/src/aotc/benchmark_db_writer and made some fixes there, since we got no response on the original repo's issue AI-Hypercomputer/aotc#1. We talked with @SujeethJinesh, and he is okay with using a forked repo for now.
About pinning to a specific commit: since the forked repo does not have strict merge rules, we would like to pin a specific commit in case the latest commit introduces new bugs while new features are being implemented. @SujeethJinesh WDYT?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the change now, I think we should definitely make the fix in the main aotc repo or at least depend on a branch off the aotc repo rather than a fork of it under different ownership. Would it be possible to do that instead?

Please create a bug for this internally and I can follow up with the aotc folks about making appropriate fixes there instead of in a forked repo.

Seems like it should be simple enough to actually do so since I don't think the changes you needed to make were very large.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@SujeethJinesh Created b/450288198 for this issue

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to push a branch to the aotc repo but got a permission-denied error.

cloud-accelerator-diagnostics
cloud-tpu-diagnostics
datasets
flax
flax==0.11.1
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we pinning to this version?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only this version works with benchmark_db_writer; with any other version we encounter the following error:

Traceback (most recent call last):

  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/deps/MaxText/train.py", line 761, in <module>

    app.run(main)

  File "/usr/local/lib/python3.12/site-packages/absl/app.py", line 316, in run

    _run_main(main, args)

  File "/usr/local/lib/python3.12/site-packages/absl/app.py", line 261, in _run_main

    sys.exit(main(argv))

             ^^^^^^^^^^

  File "/deps/MaxText/train.py", line 757, in main

    run(config, recorder, diagnostic_config)

  File "/deps/MaxText/train.py", line 752, in run

    train_loop(config, recorder)

  File "/deps/MaxText/train.py", line 618, in train_loop

    ) = setup_train_loop(config, recorder)

        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/train.py", line 554, in setup_train_loop

    state, _, state_mesh_shardings, data_iterator = maxtext_utils.setup_training_state(

                                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/maxtext_utils.py", line 942, in setup_training_state

    return setup_initial_state(

           ^^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/maxtext_utils.py", line 981, in setup_initial_state

    unboxed_abstract_state, state_mesh_annotations, state_mesh_shardings = get_abstract_state(

                                                                           ^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/maxtext_utils.py", line 1038, in get_abstract_state

    abstract_state = jax.eval_shape(init_state_partial)

                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/maxtext_utils.py", line 892, in init_initial_state

    model_vars = model.init(

                 ^^^^^^^^^^^

  File "/deps/MaxText/layers/models.py", line 126, in __call__

    logits, hidden_state = self.decoder(

                           ^^^^^^^^^^^^^

  File "/deps/MaxText/layers/decoders.py", line 610, in __call__

    y = self._apply_embedding(

        ^^^^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/layers/decoders.py", line 505, in _apply_embedding

    y = self.shared_embedding(decoder_input_tokens.astype("int32"), model_mode=model_mode)

        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/layers/nnx_wrappers.py", line 426, in __call__

    self._update_variables(module)

  File "/deps/MaxText/layers/nnx_wrappers.py", line 491, in _update_variables

    collection_state = jax.tree.map(

                       ^^^^^^^^^^^^^

  File "/usr/local/lib/python3.12/site-packages/jax/_src/tree.py", line 155, in map

    return tree_util.tree_map(f, tree, *rest, is_leaf=is_leaf)

           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/layers/nnx_wrappers.py", line 485, in _to_linen_var

    return self.metadata_fn(x) # pylint: disable=too-many-function-args

           ^^^^^^^^^^^^^^^^^^^

  File "/deps/MaxText/layers/initializers.py", line 56, in variable_to_logically_partitioned

    variable.sharding,  # type: ignore[arg-type]

    ^^^^^^^^^^^^^^^^^

  File "/usr/local/lib/python3.12/site-packages/flax/nnx/variablelib.py", line 281, in __getattr__

    return getattr(self.raw_value, name)

           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

AttributeError: The 'sharding' attribute is not available on traced array with shape float32[128256,4096].

The error occurred while tracing the function init_initial_state at /deps/MaxText/maxtext_utils.py:882 for jit. This value became a tracer due to JAX operations on these lines:



  operation a:key<urbg>[] = random_wrap[impl=urbg] b

    from line /deps/MaxText/layers/nnx_wrappers.py:293:10 (linen_rngs_dict)



  operation a:key<urbg>[] = random_fold_in b 3279144704:u32[]

    from line /deps/MaxText/layers/nnx_wrappers.py:293:10 (linen_rngs_dict)



  operation a:key<urbg>[] = random_wrap[impl=urbg] b

    from line /deps/MaxText/layers/nnx_wrappers.py:293:10 (linen_rngs_dict)



  operation a:key<urbg>[] = random_fold_in b 3279144704:u32[]

    from line /deps/MaxText/layers/nnx_wrappers.py:293:10 (linen_rngs_dict)



  operation a:key<urbg>[] = random_wrap[impl=urbg] b

    from line /deps/MaxText/layers/nnx_wrappers.py:293:10 (linen_rngs_dict)



(Additional originating lines are not shown.)

--------------------

For simplicity, JAX has removed its internal frames from the traceback of the following exception. Set JAX_TRACEBACK_FILTERING=off to include these.

Copy link
Collaborator Author

@ycchenzheng ycchenzheng Oct 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The incompatibility was caused by installing the dependencies of benchmark_db_writer, which pulls in a flax version higher than 0.11.1.
Please check b/441984274 for context

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error seems to be coming from JAX because it thinks the variable is a tracer: https://github.com/jax-ml/jax/blob/5dbbfc38c99b193f43c5273b02263d91cd04a560/jax/_src/core.py#L1047

This may need to be a separate bug fix in MaxText. Specifically, we may want to add this line here

  if isinstance(variable.value, jax.core.Tracer):
    return variable.value

This should help avoid pinning flax.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@SujeethJinesh I tried

  if isinstance(variable.value, jax.core.Tracer):
    return variable.value

and unpinned flax, it used flax 0.12 and got the following issue:

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/deps/src/MaxText/train.py", line 510, in <module>
    app.run(main)
  File "/usr/local/lib/python3.12/site-packages/absl/app.py", line 316, in run
    _run_main(main, args)
  File "/usr/local/lib/python3.12/site-packages/absl/app.py", line 261, in _run_main
    sys.exit(main(argv))
             ^^^^^^^^^^
  File "/deps/src/MaxText/train.py", line 506, in main
    run(config, recorder, diagnostic_config)
  File "/deps/src/MaxText/train.py", line 501, in run
    train_loop(config, recorder)
  File "/deps/src/MaxText/train.py", line 364, in train_loop
    ) = train_utils.setup_train_loop(config, recorder)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/deps/src/MaxText/train_utils.py", line 204, in setup_train_loop
    maxtext_utils.assert_params_sufficiently_sharded(state.params, mesh, config.sharding_tolerance)
  File "/deps/src/MaxText/maxtext_utils.py", line 805, in assert_params_sufficiently_sharded
    _raise_if_unsharded_exceeds_tolerance(
  File "/deps/src/MaxText/maxtext_utils.py", line 773, in _raise_if_unsharded_exceeds_tolerance
    raise AssertionError("\n".join(error_msg_lines))
AssertionError: Unsharded parameter percentage (25.00%)exceeds tolerance (2.00%).
The following large tensors are replicated (unsharded) but could be sharded on at least one of the available axes:
 - Name: ['params']['decoder']['layers']['mlp']['wi_0']['kernel'](Size: 1879048192, Shape: PartitionSpec(), Spec: PartitionSpec())  is unsharded on axis: ['fsdp'] could be sharded on: ['fsdp']
 - Name: ['params']['decoder']['layers']['mlp']['wi_1']['kernel'](Size: 1879048192, Shape: PartitionSpec(), Spec: PartitionSpec())  is unsharded on axis: ['fsdp'] could be sharded on: ['fsdp']
 - Name: ['params']['decoder']['layers']['mlp']['wo']['kernel'](Size: 1879048192, Shape: PartitionSpec(), Spec: PartitionSpec())  is unsharded on axis: ['fsdp'] could be sharded on: ['fsdp']
 - Name: ['params']['decoder']['layers']['self_attention']['out']['kernel'](Size: 536870912, Shape: PartitionSpec(), Spec: PartitionSpec())  is unsharded on axis: ['fsdp'] could be sharded on: ['fsdp']
 - Name: ['params']['decoder']['layers']['self_attention']['query']['kernel'](Size: 536870912, Shape: PartitionSpec(), Spec: PartitionSpec())  is unsharded on axis: ['fsdp'] could be sharded on: ['fsdp']

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check PR #2502 for another solution to avoid pinning flax

gcsfs
google-api-python-client
google-cloud-aiplatform
Expand Down
Loading