Commit ca7f426

fix tests by changing 2 gpus to 4
1 parent: 47da154

2 files changed: 9 additions & 13 deletions


tests/recipes/test_knowledge_distillation_distributed.py

Lines changed: 6 additions & 10 deletions
@@ -110,7 +110,7 @@ def test_loss(self, tmpdir, monkeypatch):
         )

     @pytest.mark.integration_test
-    @gpu_test(gpu_count=4)
+    @gpu_test(gpu_count=2)
     def test_training_state_on_resume(self, tmpdir, monkeypatch):
         """Test whether the recipe state is correctly updated on resume. Since this
         is model agnostic, we should run this on the small model only. The test
@@ -133,7 +133,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):

         # Train for two epochs
         cmd_1 = f"""
-        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -163,7 +163,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         epoch_folder = get_largest_iter_folder(tmpdir)
         epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}"
         cmd_2 = f"""
-        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -202,7 +202,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         )

     @pytest.mark.integration_test
-    @gpu_test(gpu_count=2)
+    @gpu_test(gpu_count=4)
     def test_training_state_on_resume_with_async_checkpointing(
         self, tmpdir, monkeypatch
     ):
@@ -227,7 +227,7 @@ def test_training_state_on_resume_with_async_checkpointing(

         # Train for two epochs
         cmd_1 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -255,17 +255,13 @@ def test_training_state_on_resume_with_async_checkpointing(
         runpy.run_path(TUNE_PATH, run_name="__main__")

         # Resume training
-        epoch_folder = get_largest_iter_folder(tmpdir)
-        epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}"
         cmd_2 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
             checkpointer.checkpoint_dir={ckpt_dir} \
             checkpointer.checkpoint_files=[{ckpt_path}]\
-            checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")}
-            checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}
             checkpointer.output_dir={tmpdir} \
             teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \
             teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \
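
Both changed tests in this file are gated by the `gpu_test` decorator from torchtune's test utilities, so moving `gpu_count` between 2 and 4 changes how many visible CUDA devices a runner needs before the test executes at all; note that in each hunk the new `gpu_count` matches the `--nproc_per_node` value in the corresponding `tune run` command. As a rough illustration of the gating idea only (a minimal sketch assuming pytest and torch, not torchtune's actual helper in tests/test_utils.py), such a decorator can be built on `pytest.mark.skipif`:

```python
# Minimal sketch of a gpu_count-gated test decorator, assuming only pytest and
# torch are available; torchtune's real gpu_test helper may differ in detail.
import pytest
import torch


def gpu_test(gpu_count: int = 1):
    """Skip the decorated test unless at least `gpu_count` CUDA devices are visible."""
    local_gpu_count = torch.cuda.device_count()
    return pytest.mark.skipif(
        local_gpu_count < gpu_count,
        reason=f"Not enough GPUs: test requires {gpu_count}, found {local_gpu_count}",
    )
```

Under this kind of gating, a test whose runner has fewer GPUs than `gpu_count` is skipped rather than failed, so keeping the decorator in step with `--nproc_per_node`, as this commit does, is what ensures the distributed run launches with the expected world size when the test does execute.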

tests/recipes/test_qat_lora_finetune_distributed.py

Lines changed: 3 additions & 3 deletions
@@ -213,7 +213,7 @@ def test_training_state_on_resume(
         )

     @pytest.mark.integration_test
-    @gpu_test(gpu_count=2)
+    @gpu_test(gpu_count=4)
     @pytest.mark.parametrize(
         "config, model_type, ckpt_type, save_adapter_weights_only",
         [
@@ -252,7 +252,7 @@ def test_training_state_on_resume_with_async_checkpointing(

         # Train for two epochs
         cmd_1 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 qat_lora_finetune_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 qat_lora_finetune_distributed \
             --config {config} \
             batch_size=4 \
             gradient_accumulation_steps=1 \
@@ -281,7 +281,7 @@ def test_training_state_on_resume_with_async_checkpointing(
         epoch_folder = get_largest_iter_folder(tmpdir)
         epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}"
         cmd_2 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 qat_lora_finetune_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 qat_lora_finetune_distributed \
             --config {config} \
             batch_size=4 \
             gradient_accumulation_steps=1 \
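
For reference, the `cmd_1` / `cmd_2` strings in both files are not run in a subshell: the knowledge-distillation hunk at old line 255 above shows the command being executed in-process via `runpy.run_path(TUNE_PATH, run_name="__main__")`, and the QAT LoRA tests presumably follow the same pattern. A minimal sketch of that pattern, assuming a pytest `monkeypatch` fixture and a `tune_path` argument pointing at the `tune` CLI entry script (`run_cli_in_process` is a hypothetical helper; the real tests inline this logic):

```python
# Sketch: drive a `tune run ...` command in-process from a test by patching
# sys.argv and executing the CLI entry script under __main__ semantics.
import runpy
import sys


def run_cli_in_process(cmd: str, monkeypatch, tune_path: str) -> None:
    # Turn the multi-line command string into argv tokens, as the tests do.
    monkeypatch.setattr(sys, "argv", cmd.split())
    # Execute the entry script as if it had been invoked from the shell.
    runpy.run_path(tune_path, run_name="__main__")
```

Because the command is tokenized with `.split()`, the `--nproc_per_node 4` change lands directly in the argv the launcher parses, which is presumably why it is kept in step with the `gpu_count=4` gate.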

0 commit comments
