@@ -110,7 +110,7 @@ def test_loss(self, tmpdir, monkeypatch):
         )
 
     @pytest.mark.integration_test
-    @gpu_test(gpu_count=4)
+    @gpu_test(gpu_count=2)
     def test_training_state_on_resume(self, tmpdir, monkeypatch):
         """Test whether the recipe state is correctly updated on resume. Since this
         is model agnostic, we should run this on the small model only. The test
@@ -133,7 +133,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
 
         # Train for two epochs
         cmd_1 = f"""
-        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -163,7 +163,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         epoch_folder = get_largest_iter_folder(tmpdir)
         epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}"
         cmd_2 = f"""
-        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -202,7 +202,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         )
 
     @pytest.mark.integration_test
-    @gpu_test(gpu_count=2)
+    @gpu_test(gpu_count=4)
     def test_training_state_on_resume_with_async_checkpointing(
         self, tmpdir, monkeypatch
     ):
@@ -227,7 +227,7 @@ def test_training_state_on_resume_with_async_checkpointing(
 
         # Train for two epochs
         cmd_1 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -255,17 +255,13 @@ def test_training_state_on_resume_with_async_checkpointing(
         runpy.run_path(TUNE_PATH, run_name="__main__")
 
         # Resume training
-        epoch_folder = get_largest_iter_folder(tmpdir)
-        epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}"
         cmd_2 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
             checkpointer.checkpoint_dir={ckpt_dir} \
             checkpointer.checkpoint_files=[{ckpt_path}]\
-            checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")}
-            checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}
             checkpointer.output_dir={tmpdir} \
             teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \
             teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \