Commit 09858a7 (parent: 1c701d7)
Author: amaurya

Add preserves_storage_sharing for checkpoint engines

Signed-off-by: amaurya <[email protected]>

File tree: 7 files changed, +81 −6 lines

deepspeed/runtime/checkpoint_engine/checkpoint_engine.py (+5)

@@ -32,3 +32,8 @@ def commit(self, tag):
     def wait(self):
         # To wait in asynchronous checkpoint engines (e.g. DataStates-LLM) for the previous snapshot to finish
         pass
+
+    def preserves_storage_sharing(self):
+        # Check if the checkpoint engine preserves storage sharing
+        # (set to false if cloning is required to get actual tensor sizes)
+        return False
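
The cloning caveat in the comment above comes from how `torch.save` works: it serializes a tensor's entire underlying storage, so a small view over a large shared buffer bloats the checkpoint unless it is cloned first. A minimal, self-contained illustration of that behavior (plain PyTorch, not DeepSpeed code; the sizes are arbitrary):

```python
import io

import torch


def saved_nbytes(t):
    # Serialize a tensor in memory and report the resulting size.
    buf = io.BytesIO()
    torch.save(t, buf)
    return buf.getbuffer().nbytes


flat = torch.zeros(1_000_000)   # large flat buffer (~4 MB of storage)
view = flat[:10]                # tiny view sharing flat's storage

print(saved_nbytes(view))          # ~4 MB: the whole shared storage is serialized
print(saved_nbytes(view.clone()))  # ~1 KB: just the 10 elements plus metadata
```

An engine that returns `True` from `preserves_storage_sharing()` is asserting that it handles such shared storages itself, so callers can skip `clone_tensors_for_torch_save` and avoid the extra host-side copy.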

deepspeed/runtime/checkpoint_engine/datastates_checkpoint_engine.py (+3)

@@ -32,3 +32,6 @@ def commit(self, tag):
 
     def wait(self):
         return self.ckpt_engine.wait()
+
+    def preserves_storage_sharing(self):
+        return True

deepspeed/runtime/pipe/module.py (+4 −4)

@@ -608,7 +608,6 @@ def save_state_dict(self, save_dir, checkpoint_engine, exclude_frozen_params=False):
         layer_list = self.forward_funcs[start:end]
 
         checkpoint_engine.makedirs(save_dir, exist_ok=True)
-        debloat_memory = "DataStatesCheckpointEngine" not in str(type(checkpoint_engine))
         for idx, layer in enumerate(layer_list):
             model_ckpt_path = self.ckpt_layer_path(save_dir, start + idx)
             if not hasattr(layer, 'state_dict'):
@@ -619,10 +618,11 @@ def save_state_dict(self, save_dir, checkpoint_engine, exclude_frozen_params=False):
             for n in self._get_frozen_parameter_names(layer):
                 del orig_state_dict[n]
 
-            if debloat_memory:
-                final_state_dict = clone_tensors_for_torch_save(orig_state_dict)
-            else:
+            if checkpoint_engine.preserves_storage_sharing():
                 final_state_dict = orig_state_dict
+            else:
+                final_state_dict = clone_tensors_for_torch_save(orig_state_dict)
+
             checkpoint_engine.save(final_state_dict, model_ckpt_path)
 
     def load_state_dir(self, load_dir, checkpoint_engine, strict=True):
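
With the branch above keyed off the engine's `preserves_storage_sharing()` hook instead of a hard-coded class-name check, a third-party asynchronous engine can opt out of the cloning path by overriding the hook. A minimal sketch, assuming the `CheckpointEngine` base class shown earlier; `MyAsyncCheckpointEngine` and its `torch.save`-based body are hypothetical placeholders, not a real DeepSpeed engine:

```python
import torch

from deepspeed.runtime.checkpoint_engine.checkpoint_engine import CheckpointEngine


class MyAsyncCheckpointEngine(CheckpointEngine):
    """Hypothetical engine that snapshots tensors itself before flushing."""

    def create(self, tag):
        pass  # e.g. start a checkpoint session for this tag

    def save(self, state_dict, path):
        # A real asynchronous engine would copy tensors into pinned host
        # buffers and flush them in the background; torch.save is a stand-in.
        torch.save(state_dict, path)

    def commit(self, tag):
        return True

    def wait(self):
        pass

    def preserves_storage_sharing(self):
        # Tell callers such as PipelineModule.save_state_dict that tensors do
        # not need clone_tensors_for_torch_save() before being passed to save().
        return True
```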

deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py (+2 −2)

@@ -5,9 +5,10 @@
 """
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
-import torch
+
 from deepspeed.ops.op_builder import AsyncIOBuilder
 from deepspeed import comm as dist
+import torch
 
 from deepspeed.runtime.swap_tensor.constants import *
 from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object
@@ -185,7 +186,6 @@ def _swap_out_optimizer_state(self, aio_handle, parameter, swap_in_op):
         for pinned_dst, unpinned_src in zip(new_alloc_buffers, unpinned_tensors):
             dst = get_sized_buffer(pinned_dst, unpinned_src.numel())
             dst.data.copy_(unpinned_src.data)
-            unpinned_src.data = torch.Tensor()
 
         swap_paths = param_info.swap_paths.copy()
         assert len(swap_paths) == len(swap_buffers)
New file: DataStates-LLM checkpointing tutorial (+67 lines)

---
title: "DataStates-LLM Checkpointing Engine"
tags: asynchronous checkpointing for minimizing I/O overheads.
---

This tutorial shows how to use [DataStates-LLM](https://github.com/DataStates/datastates-llm) for asynchronous checkpointing with DeepSpeed. DataStates-LLM introduces a lazy asynchronous checkpointing mechanism tailored for LLMs that minimizes I/O overhead and improves training efficiency; the sections below walk through integrating it with the DeepSpeed framework.

## Overview of DataStates-LLM

DataStates-LLM is designed to address the challenges of frequent checkpointing in LLM training by introducing a lazy asynchronous multi-level approach. It leverages the immutability of model parameters and optimizer states during forward and backward passes to perform non-blocking data transfers, thereby reducing interference with the training process. This method has demonstrated up to 48x faster checkpointing and 2.2x faster end-to-end training times compared to traditional approaches, as outlined in [DataStates-LLM: Lazy Asynchronous Checkpointing for Large Language Models](https://arxiv.org/abs/2406.10707).

## Prerequisites

Before integrating DataStates-LLM with DeepSpeed, ensure the following:

- **DeepSpeed Installation**: DeepSpeed should be installed in your environment. If not, refer to the [DeepSpeed Getting Started Guide](https://github.com/microsoft/DeepSpeed/blob/master/docs/_tutorials/getting-started.md) for installation instructions.

- **DataStates-LLM Repository**: Access the DataStates-LLM source code from its [GitHub repository](https://github.com/DataStates/datastates-llm) and follow the installation instructions provided therein.

## Configuring DeepSpeed for DataStates-LLM

To enable DataStates-LLM's asynchronous checkpointing within DeepSpeed, modify the `deepspeed_config.json` file to include the settings under the `datastates_ckpt` section. Below is an example configuration:

```json
{
    // ... other DeepSpeed configuration options
    "datastates_ckpt": {
        "host_cache_size": 16,
        "parser_threads": 8
    }
}
```

### Configuration Parameters

- **`host_cache_size`**: Specifies the amount of pinned host memory (in gigabytes) reserved for asynchronous checkpoint flushing. Adjust this value based on your system's memory capacity and the size of your model checkpoints.

- **`parser_threads`**: Determines the number of threads dedicated to parsing checkpoint file requests in parallel. Increasing this value can enhance parsing throughput but may also increase CPU utilization.

## Implementing DataStates-LLM in Your Training Script

After enabling DataStates checkpointing in `deepspeed_config.json`, configure the checkpointing frequency by specifying the number of iterations between checkpoints with the command-line parameter `--save-interval`, as sketched below.
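
As a rough illustration, the loop below saves a checkpoint every `--save-interval` iterations. The model, synthetic data, and argument handling are placeholders; `deepspeed.add_config_arguments()`, `deepspeed.initialize()`, `engine.backward()`/`engine.step()`, and `engine.save_checkpoint()` are the DeepSpeed APIs being exercised, and the `datastates_ckpt` section shown earlier is what makes each save asynchronous.

```python
import argparse

import torch
import deepspeed

parser = argparse.ArgumentParser()
parser.add_argument("--save-interval", type=int, default=100,
                    help="iterations between checkpoints")
parser.add_argument("--local_rank", type=int, default=-1)
parser = deepspeed.add_config_arguments(parser)  # adds --deepspeed_config, etc.
args = parser.parse_args()

model = torch.nn.Linear(1024, 1024)  # placeholder model
engine, _, _, _ = deepspeed.initialize(args=args,
                                       model=model,
                                       model_parameters=model.parameters())

for step in range(1, 10_001):
    batch = torch.randn(8, 1024, device=engine.device)  # synthetic batch
    loss = engine(batch).float().pow(2).mean()           # placeholder loss
    engine.backward(loss)
    engine.step()

    if step % args.save_interval == 0:
        # With datastates_ckpt enabled in the DeepSpeed config, this call
        # returns quickly and the flush to storage proceeds asynchronously.
        engine.save_checkpoint("checkpoints", tag=f"step{step}")
```
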
## Performance Results

The checkpoint acceleration achieved by DataStates-LLM for various models is shown in the figures below.

![Higher checkpointing throughput](/assets/images/datastates-async-checkpointing/diff-models-ckpt-throughput.png){: .align-center}

![Faster training iterations](/assets/images/datastates-async-checkpointing/diff-models-iter-times.png){: .align-center}

## Limitations and Ongoing Work

1. DataStates-LLM currently supports only the CUDA runtime on NVIDIA GPUs.

2. DataStates-LLM has only been tested with ZeRO stage 1, without offloading to any other tiers.

3. While the checkpoint layout of DataStates matches Hugging Face's [safetensors](https://huggingface.co/docs/safetensors/) format, it is not yet fully compatible with the safetensors library because of the pickled objects DeepSpeed requires during restart.

4. DataStates-LLM does not yet support universal or elastic checkpointing.

## Questions and Support

Please use the [DataStates-LLM GitHub repository](https://github.com/DataStates/datastates-llm) for any questions, issues, or feature requests.