Skip to content

Commit

Permalink
Remove tpu configs from AOT test and update HybridSim test name
Browse files (browse the repository at this point in the history)
  • Loading branch information
raymondzouu committed Jan 14, 2025
1 parent 2d2ce84 commit f632913
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 65 deletions.
2 changes: 1 addition & 1 deletion dags/common/quarantined_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ class QuarantineTests:
# DAG: maxtext_configs_aot
"maxtext-aot-v5e-stable-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"),
"maxtext-aot-v5e-nightly-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"),
- # DAG: maxtext_configs_aot_hybridsim
+ # DAG: maxtext_configs_hybridsim
"16b-1xv5litepod-256-aot-hybridsim": TestInfo(
team.PERFORMANCE, "2024-11-12"
),
Expand Down
62 changes: 0 additions & 62 deletions dags/multipod/maxtext_configs_aot.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,72 +36,10 @@
catchup=False,
concurrency=2,
) as dag:
# Testing configurations
tpu_configs = {
# accelerator: [(model_size, num_cores), ...],
"v4": [("22b", 128), ("52b", 384)],
"v5e": [("16b", 256), ("32b", 256), ("64b", 256), ("128b", 256)],
"v5p": [
("32b", 128),
("64b", 128),
("128b", 256),
("128b", 512),
("256b", 1024),
("512b", 1024),
("1024b", 2048),
("1024b", 4096),
],
}
num_slices = [1, 2]
docker_images = [
(SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK),
(SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY),
]

run_model_cmds_dict = {}
for tpu, models in tpu_configs.items():
run_model_cmds = []
for model_size, num_cores in models:
for n in num_slices:
cmd = f"bash MaxText/configs/{tpu}/{model_size}.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY={tpu}-{num_cores} M_COMPILE_TOPOLOGY_NUM_SLICES={n}"
run_model_cmds.append(cmd)
run_model_cmds_dict[tpu] = run_model_cmds

quarantine_task_group = TaskGroup(
group_id="Quarantine", dag=dag, prefix_group_id=False
)

for mode, image in docker_images:
maxtext_v4_configs_test = gke_config.get_gke_config(
time_out_in_min=60,
test_name=f"maxtext-aot-v4-{mode.value}",
run_model_cmds=run_model_cmds_dict["v4"],
docker_image=image.value,
test_owner=test_owner.RAYMOND_Z,
).run_with_quarantine(quarantine_task_group)

maxtext_v5e_configs_test = gke_config.get_gke_config(
time_out_in_min=60,
test_name=f"maxtext-aot-v5e-{mode.value}",
run_model_cmds=run_model_cmds_dict["v5e"],
docker_image=image.value,
test_owner=test_owner.RAYMOND_Z,
).run_with_quarantine(quarantine_task_group)

maxtext_v5p_configs_test = gke_config.get_gke_config(
time_out_in_min=60,
test_name=f"maxtext-aot-v5p-{mode.value}",
run_model_cmds=run_model_cmds_dict["v5p"],
docker_image=image.value,
test_owner=test_owner.RAYMOND_Z,
).run_with_quarantine(quarantine_task_group)

(
maxtext_v4_configs_test
>> maxtext_v5e_configs_test
>> maxtext_v5p_configs_test
)

# GPU AoT tests
cmd = f"bash MaxText/configs/a3/llama_2_7b/16vm.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY=a3 M_COMPILE_TOPOLOGY_NUM_SLICES=16"
stable_a3_gpu = gke_config.get_maxtext_end_to_end_gpu_gke_test_config(
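Expand Down
4 changes: 2 additions & 2 deletions (third changed file; its path was not captured in this extract)
Original file line number Diff line number Diff line change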
Expand Up @@ -38,7 +38,7 @@ def hybridsim_compile_and_run(test_group_id):
shared_gcs_location = name_format.generate_gcs_folder_location.override(
task_id=f"{test_group_id}_generate_gcs_folder_location"
)(
- f"{gcs_subfolder}/maxtext_configs_aot_hybridsim/v{tpu.value}",
+ f"{gcs_subfolder}/maxtext_configs_hybridsim/v{tpu.value}",
test_group_id,
)

Expand Down Expand Up @@ -83,7 +83,7 @@ def hybridsim_compile_and_run(test_group_id):


with models.DAG(
- dag_id="maxtext_configs_aot_hybridsim",
+ dag_id="maxtext_configs_hybridsim",
schedule=SCHEDULED_TIME,
tags=["multipod_team", "maxtext", "nightly", "mlscale_onduty"],
start_date=datetime.datetime(2024, 2, 19),
Expand Down

0 comments on commit f632913

Please sign in to comment.