Skip to content

Commit

Permalink
Change regression tests to match the updated recipe (#600)
Browse files Browse the repository at this point in the history
* Change regression tests to match the updated recipe

* Change regression tests to match the updated recipe

* format

* minor change

* minor change

* minor change

* minor change
Loading branch information
gunjanj007 authored Feb 18, 2025
1 parent 85354b7 commit de647ec
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
15 changes: 10 additions & 5 deletions dags/map_reproducibility/a3ultra_mixtral_8_7b_nemo.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,12 @@
VALUE_YAML_PATH = (
f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
)
CLUSTER = "gke-a3u-map-01-31"
CLUSTER_REGION = "europe-west1"
CLUSTER = "a3ultra-benchmark"
CLUSTER_REGION = "us-west1"
SOFTWARE_ID = "pytorch_nemo"
IMAGE_VERSION = "nemo_workload:24.07"
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
IMAGE_VERSION = "nemo24.07"
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:{IMAGE_VERSION}-gib1.0.3-A3U"
KUEUE_NAME = "a3-ultra"


@task
Expand Down Expand Up @@ -120,9 +121,13 @@ def run_aotc_workload():
recipe_repo_root,
DOCKER_IMAGE,
cluster_name=CLUSTER,
kueue_name=KUEUE_NAME,
)
+ wait_for_jobs_cmds()
+ copy_bucket_cmds(recipe_repo_root)
+ copy_bucket_cmds(
recipe_repo_root,
hypercomputer=HYPERCOMPUTER,
)
+ get_nemo_metrics_cmds(
global_batch_size,
num_gpus,
Expand Down
12 changes: 10 additions & 2 deletions dags/map_reproducibility/utils/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,12 @@ def helm_apply_cmds(
docker_image,
aotc: bool = False,
cluster_name: str = "a3plus-benchmark",
kueue_name: str = "a3-ultra",
):
gcs_cmd = ""
if hypercomputer == "a3ultra":
gcs_cmd = f" --set clusterName={cluster_name}"
gcs_cmd += f" --set queue={kueue_name}"
gcs_cmd += f" --set volumes.gcsMounts[0].bucketName={BUCKET_NAME}"
else:
gcs_cmd = f" --set workload.gcsBucketForDataCataPath={BUCKET_NAME}"
Expand Down Expand Up @@ -150,10 +152,16 @@ def wait_for_jobs_cmds():
return wait_for_job


def copy_bucket_cmds(recipe_repo_root):
def copy_bucket_cmds(recipe_repo_root, hypercomputer: str = "a3mega"):
gcs_location = ""
if hypercomputer == "a3ultra":
gcs_location = f"gs://{BUCKET_NAME}/nemo-experiments/megatron_gpt/"
else:
gcs_location = f"gs://{BUCKET_NAME}/nemo-experiments/"

copy_bucket_contents = (
"export COMPLETE_JOB_NAME=$(gcloud storage ls "
f"gs://{BUCKET_NAME}/nemo-experiments/ | grep $JOB_NAME)",
f"{gcs_location} | grep $JOB_NAME)",
'echo "COMPLETE_JOB_NAME ${COMPLETE_JOB_NAME}"',
f"cd {recipe_repo_root}/src/utils/training_metrics",
"gcloud storage cp ${COMPLETE_JOB_NAME}"
Expand Down

0 comments on commit de647ec

Please sign in to comment.