pytorch
diff --git a/‎recipes/knowledge_distillation_single_device.py
+60-90 b/‎recipes/knowledge_distillation_single_device.py
+60-90
diff --git a/‎tests/recipes/test_knowledge_distillation_distributed.py
+92 b/‎tests/recipes/test_knowledge_distillation_distributed.py
+92
@@ -13,7 +13,7 @@
 
 import torch
 import torchtune.modules.common_utils as common_utils
-from omegaconf import DictConfig, ListConfig
+from omegaconf import DictConfig, ListConfig, OmegaConf
 
 from torch import nn
 from torch.optim import Optimizer
@@ -24,14 +24,16 @@
 from torchtune.datasets import ConcatDataset
 from torchtune.modules.peft import (
     get_adapter_params,
-    get_adapter_state_dict,
     get_lora_module_names,
-    get_merged_lora_ckpt,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
 from torchtune.recipe_interfaces import FTRecipeInterface
 from torchtune.training import DummyProfiler, PROFILER_KEY
+from torchtune.training.checkpointing._checkpoint_client import (
+    CheckpointClient,
+    TrainingProgress,
+)
 from tqdm import tqdm
 
 
@@ -140,36 +142,21 @@ def __init__(self, cfg: DictConfig) -> None:
         self._clip_grad_norm = cfg.get("clip_grad_norm", None)
         self._kd_ratio = cfg.get("kd_ratio", 0.5)
 
-    def load_checkpoint(self, cfg_checkpointer: DictConfig) -> Dict[str, Any]:
-        """
-        Extract the checkpoint state from file and validate. This includes the
-        base model weights. If resume_from_checkpoint is True, this also includes
-        the adapter weights and recipe state
-        """
-        self._checkpointer = config.instantiate(
-            cfg_checkpointer,
-            should_load_recipe_state=self._resume_from_checkpoint,
-        )
-        checkpoint_dict = self._checkpointer.load_checkpoint()
-
-        if self._resume_from_checkpoint:
-            if training.ADAPTER_KEY not in checkpoint_dict:
-                raise ValueError(
-                    "Adapter weights not found. Please ensure a valid adapter checkpoint is provided."
-                )
-            # _update_recipe_state will throw an exception if the recipe state is not corrctly loaded
-            # no need to check here
-            self._update_recipe_state(checkpoint_dict)
-        return checkpoint_dict
+        self._checkpoint_client = CheckpointClient(cfg)
+        self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False)
 
     def load_teacher_checkpoint(self, cfg_checkpointer: DictConfig) -> Dict[str, Any]:
         """
         Extract the teacher checkpoint state from file.
         """
-        teacher_checkpointer = config.instantiate(
-            cfg_checkpointer,
+        # add checkpointer class to config to work with checkpoint_client
+        checkpointer_dict = {"checkpointer": cfg_checkpointer}
+
+        new_cfg_checkpointer = OmegaConf.create(checkpointer_dict)
+        teacher_checkpoint_client = CheckpointClient(
+            new_cfg_checkpointer,
         )
-        checkpoint_dict = teacher_checkpointer.load_checkpoint()
+        checkpoint_dict = teacher_checkpoint_client.load_base_checkpoint()
         return checkpoint_dict
 
     def _update_recipe_state(self, ckpt_dict: Dict[str, Any]) -> None:
@@ -220,7 +207,7 @@ def setup(self, cfg: DictConfig) -> None:
 
         self._metric_logger = config.instantiate(cfg.metric_logger)
 
-        ckpt_dict = self.load_checkpoint(cfg.checkpointer)
+        checkpoint_dict = self._checkpoint_client.load_base_checkpoint()
 
         # log config with parameter override
         self._metric_logger.log_config(cfg)
@@ -230,7 +217,7 @@ def setup(self, cfg: DictConfig) -> None:
             raise ValueError(
                 "NPU does not support model compilation. Please set `compile: False` in the config."
             )
-        checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer)
+
         teacher_checkpoint_dict = self.load_teacher_checkpoint(
             cfg_checkpointer=cfg.teacher_checkpointer
         )
@@ -245,7 +232,7 @@ def setup(self, cfg: DictConfig) -> None:
             base_model_state_dict=checkpoint_dict[training.MODEL_KEY],
             lora_weights_state_dict=(
                 checkpoint_dict[training.ADAPTER_KEY]
-                if self._resume_from_checkpoint
+                if training.ADAPTER_KEY in checkpoint_dict
                 else None
             ),
         )
@@ -255,6 +242,27 @@ def setup(self, cfg: DictConfig) -> None:
             model_state_dict=teacher_checkpoint_dict[training.MODEL_KEY],
         )
 
+        if self._resume_from_checkpoint:
+            # If async checkpointing is enabled, intermediate checkpoints are saved asynchronously
+            # using the DistributedCheckpointer.
+            # Therefore the recipe needs to load the distributed checkpoint to restore the training
+            # progress.
+            if self._enable_async_checkpointing:
+                checkpoint_dict = self._checkpoint_client.load_distributed_checkpoint(
+                    self._model,
+                    self._optimizer,
+                    self._adapter_config,
+                    self._save_adapter_weights_only,
+                )
+
+            if training.ADAPTER_KEY not in checkpoint_dict:
+                raise ValueError(
+                    "Adapter weights not found. Please ensure a valid adapter checkpoint is provided."
+                )
+
+            # Update the recipe state from the checkpoint state dict.
+            self._update_recipe_state(checkpoint_dict)
+
         self._tokenizer = config.instantiate(cfg.tokenizer)
         self._logger.info("Tokenizer is initialized from file.")
 
@@ -294,11 +302,6 @@ def setup(self, cfg: DictConfig) -> None:
             cfg_dataset=cfg.dataset,
             batch_size=cfg.batch_size,
             shuffle=cfg.shuffle,
-            dataloader_state_dict=(
-                ckpt_dict[training.DATALOADER_KEY]
-                if self._resume_from_checkpoint
-                else None
-            ),
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -378,6 +381,17 @@ def _setup_model(
         self._lora_attn_modules = list(cfg_model.lora_attn_modules)
         self._apply_lora_to_mlp = cfg_model.apply_lora_to_mlp
         self._apply_lora_to_output = getattr(cfg_model, "apply_lora_to_output", False)
+        self._adapter_config = {
+            "r": self._lora_rank,
+            "lora_alpha": self._lora_alpha,
+            "target_modules": get_lora_module_names(
+                self._lora_attn_modules,
+                self._apply_lora_to_mlp,
+                self._apply_lora_to_output,
+            ),
+            "peft_type": "LORA",
+        }
+
         self.adapter_params = get_adapter_params(model)
         self._is_dora = any(["magnitude" in k for k in self.adapter_params.keys()])
         set_trainable_params(model, self.adapter_params)
@@ -492,7 +506,6 @@ def _setup_data(
         cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
-        dataloader_state_dict: Optional[Dict[str, Any]] = None,
     ) -> StatefulDataLoader:
         """
         All data related setup happens here. This recipe currently supports only
@@ -537,63 +550,20 @@ def _setup_data(
         return dataloader
 
     def save_checkpoint(self, epoch: int) -> None:
-        """
-        Checkpoint the state of the recipe. The constructed checkpoint state dict
-        contains the following information:
-        - Merged weights with key MODEL_KEY
-        - Adapter weights with key ADAPTER_KEY
-        - Relevant recipe state if training is not complete
-        - If the `self._save_adapter_weights_only` option is True, the checkpointer will save only the adapter weights
-
-        To correctly resume from training, the adapter weights and recipe state must be provided along with the base model weights.
-        """
-        ckpt_dict = {}
-
-        intermediate_checkpoint = epoch + 1 < self.total_epochs
-        # if training is in-progress, checkpoint the optimizer state as well
-        if intermediate_checkpoint:
-            ckpt_dict.update(
-                {
-                    training.OPT_KEY: self._optimizer.state_dict(),
-                    training.SEED_KEY: self.seed,
-                    training.EPOCHS_KEY: self.epochs_run,
-                    training.TOTAL_EPOCHS_KEY: self.total_epochs,
-                    training.MAX_STEPS_KEY: self.max_steps_per_epoch,
-                    training.DATALOADER_KEY: self._dataloader.state_dict(),
-                }
-            )
-
-        # Move to CPU to avoid a copy on GPU
-        state_dict = {k: v.cpu() for k, v in self._model.state_dict().items()}
-
-        # Construct the full state dict with LoRA weights merged into base LLM weights
-        merged_state_dict = get_merged_lora_ckpt(
-            state_dict,
-            rank=self._lora_rank,
-            alpha=self._lora_alpha,
-        )
-        ckpt_dict.update({training.MODEL_KEY: merged_state_dict})
-
-        # Construct the adapter weights
-        adapter_state_dict = get_adapter_state_dict(self._model.state_dict())
-        ckpt_dict.update({training.ADAPTER_KEY: adapter_state_dict})
-        adapter_config = {
-            "r": self._lora_rank,
-            "lora_alpha": self._lora_alpha,
-            "target_modules": get_lora_module_names(
-                self._lora_attn_modules,
-                self._apply_lora_to_mlp,
-                self._apply_lora_to_output,
+        self._checkpoint_client.save_checkpoint(
+            model=self._model,
+            optimizer=self._optimizer,
+            training_progress=TrainingProgress(
+                seed=self.seed,
+                epochs_run=self.epochs_run,
+                total_epochs=self.total_epochs,
+                max_steps_per_epoch=self.max_steps_per_epoch,
+                dataloader_state_dict=self._dataloader.state_dict(),
             ),
-            "peft_type": "LORA",
-        }
-        ckpt_dict.update({training.ADAPTER_CONFIG: adapter_config})
-
-        self._checkpointer.save_checkpoint(
-            ckpt_dict,
             epoch=epoch,
-            intermediate_checkpoint=intermediate_checkpoint,
+            adapter_config=self._adapter_config.copy(),
             adapter_only=self._save_adapter_weights_only,
+            single_device=True,
         )
 
     def _loss_step(
 
@@ -201,6 +201,98 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
             loss_values, expected_loss_values, rtol=1e-5, atol=1e-5
         )
 
+    @pytest.mark.integration_test
+    @gpu_test(gpu_count=4)
+    def test_training_state_on_resume_with_async_checkpointing(
+        self, tmpdir, monkeypatch
+    ):
+        """Test whether the recipe state is correctly updated on resume. Since this
+        is model agnostic, we should run this on the small model only. The test
+        consists of three stages:
+            - Train a model for 2 epochs
+            - Resume training after epoch 1
+            - Make sure final loss matches the expected value of a model successfully resumed from a ckpt
+        """
+
+        ckpt = "llama3_tune"
+        ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
+        ckpt_dir = ckpt_path.parent
+        log_file = gen_log_file_name(tmpdir)
+        tokenizer_path = Path(TOKENIZER_PATHS["llama3"])
+
+        # Config file needed for model conversion.
+        # Create a second copy for training resume
+        write_hf_ckpt_config(ckpt_dir)
+        write_hf_ckpt_config(tmpdir)
+
+        # Train for two epochs
+        cmd_1 = f"""
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+            --config llama3_2/8B_to_1B_KD_lora_distributed \
+            output_dir={tmpdir} \
+            checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
+            checkpointer.checkpoint_dir='{ckpt_dir}' \
+            checkpointer.checkpoint_files=[{ckpt_path}]\
+            checkpointer.output_dir={tmpdir} \
+            teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \
+            teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \
+            teacher_checkpointer.checkpoint_files=[{ckpt_path}] \
+            teacher_checkpointer.output_dir={tmpdir} \
+            enable_async_checkpointing=True \
+            tokenizer.path={tokenizer_path} \
+            tokenizer.prompt_template=null \
+        """.split()
+
+        model_config = MODEL_TEST_CONFIGS["llama3_lora"]
+        teacher_config = [
+            "teacher_" + config for config in MODEL_TEST_CONFIGS["llama3"]
+        ]
+
+        cmd_1 = (
+            cmd_1 + self._get_test_config_overrides() + model_config + teacher_config
+        )
+        monkeypatch.setattr(sys, "argv", cmd_1)
+        runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        # Resume training
+        cmd_2 = f"""
+        tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \
+            --config llama3_2/8B_to_1B_KD_lora_distributed \
+            output_dir={tmpdir} \
+            checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
+            checkpointer.checkpoint_dir={ckpt_dir} \
+            checkpointer.checkpoint_files=[{ckpt_path}]\
+            checkpointer.output_dir={tmpdir} \
+            teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \
+            teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \
+            teacher_checkpointer.checkpoint_files=[{ckpt_path}] \
+            teacher_checkpointer.output_dir={tmpdir} \
+            resume_from_checkpoint=True \
+            enable_async_checkpointing=True \
+            metric_logger.filename={log_file} \
+            tokenizer.path={tokenizer_path} \
+            tokenizer.prompt_template=null \
+        """.split()
+        cmd_2 = (
+            cmd_2
+            + self._get_test_config_overrides(epochs=3)
+            + model_config
+            + teacher_config
+        )
+        monkeypatch.setattr(sys, "argv", cmd_2)
+        runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        # Second epoch only
+        expected_loss_values = self._fetch_expected_loss_values("llama3")[2:]
+        loss_values = get_loss_values_from_metric_logger(log_file)
+        # only take the first loss
+        num_losses = int(len(loss_values) / 4)  # 2 steps per epoch, 2 epochs
+        loss_values = loss_values[0::num_losses][:2]
+
+        torch.testing.assert_close(
+            loss_values, expected_loss_values, rtol=1e-5, atol=1e-5
+        )
+
     @pytest.mark.integration_test
     @gpu_test(gpu_count=4)
     def test_save_and_load_merged_weights(self, tmpdir, monkeypatch):