@@ -110,7 +110,7 @@ def test_loss(self, tmpdir, monkeypatch):
         )
 
     @pytest.mark.integration_test
-    @gpu_test(gpu_count=4)
+    @gpu_test(gpu_count=2)
     def test_training_state_on_resume(self, tmpdir, monkeypatch):
         """Test whether the recipe state is correctly updated on resume. Since this
         is model agnostic, we should run this on the small model only. The test
@@ -133,7 +133,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
 
         # Train for two epochs
         cmd_1 = f"""
-        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -163,7 +163,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         epoch_folder = get_largest_iter_folder(tmpdir)
         epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}"
         cmd_2 = f"""
-        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -202,7 +202,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         )
 
     @pytest.mark.integration_test
-    @gpu_test(gpu_count=2)
+    @gpu_test(gpu_count=4)
     def test_training_state_on_resume_with_async_checkpointing(
         self, tmpdir, monkeypatch
     ):
@@ -227,7 +227,7 @@ def test_training_state_on_resume_with_async_checkpointing(
 
         # Train for two epochs
         cmd_1 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -255,17 +255,13 @@ def test_training_state_on_resume_with_async_checkpointing(
         runpy.run_path(TUNE_PATH, run_name="__main__")
 
         # Resume training
-        epoch_folder = get_largest_iter_folder(tmpdir)
-        epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}"
         cmd_2 = f"""
-        tune run --nnodes 1 --nproc_per_node 2 knowledge_distillation_distributed \
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
             --config llama3_2/8B_to_1B_KD_lora_distributed \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
             checkpointer.checkpoint_dir={ckpt_dir} \
             checkpointer.checkpoint_files=[{ckpt_path}]\
-            checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")}
-            checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}
             checkpointer.output_dir={tmpdir} \
             teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \
             teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \