Skip to content

Commit

Permalink
Move a3u load to a working cluster (#602)
Browse files (browse the repository at this point in the history)
* change cluster name

* change cluster name
  • Loading branch information
gunjanj007 authored Feb 18, 2025
1 parent de647ec commit 2ebb947
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 8 deletions.
4 changes: 2 additions & 2 deletions dags/map_reproducibility/a3ultra_mixtral_8_7b_nemo.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -55,8 +55,8 @@
VALUE_YAML_PATH = (
f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
)
-CLUSTER = "a3ultra-benchmark"
-CLUSTER_REGION = "us-west1"
+CLUSTER = "a3ultra-bmark72"
+CLUSTER_REGION = "europe-west1"
SOFTWARE_ID = "pytorch_nemo"
IMAGE_VERSION = "nemo24.07"
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:{IMAGE_VERSION}-gib1.0.3-A3U"
Expand Down
7 changes: 1 addition & 6 deletions dags/map_reproducibility/utils/common_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -187,12 +187,7 @@ def get_nemo_metrics_cmds(


def cleanup_cmds():
-cleanup = (
-"helm uninstall $JOB_NAME",
-"kubectl get pods "
-"--no-headers=true | awk '{print $1}' "
-"| grep $JOB_NAME | xargs kubectl delete pods",
-)
+cleanup = ("helm uninstall $JOB_NAME",)
return cleanup


Expand Down

0 comments on commit 2ebb947

Please sign in to comment.