@@ -497,8 +497,12 @@ def __next__(self):
         return self.micro_batches.pop(0)

     def refill(self):
-        # this will raise StopIteration when empty
-        batch = next(self.iter)
+        # reset the iterator if StopIteration arrives, and re-raise it to allow multiple epochs to run
+        try:
+            batch = next(self.iter)
+        except StopIteration:
+            self.iter = iter(self.dl)
+            raise StopIteration
         micro_batches = defaultdict(dict)
         # XXX: replace with more efficient all-to-all?

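The first hunk's pattern can be seen in isolation in the following minimal sketch (the EpochAwareIterator class and the list standing in for a dataloader are illustrative, not this PR's code): when the wrapped iterator is exhausted, it is re-created from the dataloader before StopIteration is propagated, so the caller can treat the exception as an epoch boundary and keep calling refill() for the next epoch.

# Illustrative sketch only (assumed names, not the PR's implementation).
class EpochAwareIterator:
    def __init__(self, dl):
        self.dl = dl
        self.iter = iter(dl)

    def refill(self):
        try:
            return next(self.iter)
        except StopIteration:
            # re-arm the iterator so the next epoch can start cleanly,
            # then let the caller see the end-of-epoch signal
            self.iter = iter(self.dl)
            raise

# each inner loop is one epoch; StopIteration marks the epoch boundary
buf = EpochAwareIterator([1, 2, 3])
for epoch in range(2):
    while True:
        try:
            print(epoch, buf.refill())
        except StopIteration:
            break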
@@ -982,7 +986,8 @@ def forward(
         if output_reduction == "mean":
             incoming_grad /= shards

-        x_grad = torch.zeros_like(x)
+        # XXX: deal with the use case of running in inference mode, where we don't need backward
+        x_grad = torch.zeros_like(x) if x_requires_grad else None
         x_shards = list(torch.chunk(x, chunks=shards, dim=0))
         y_shards = list(torch.chunk(y, chunks=shards, dim=0))
         if mask is not None:
@@ -1007,15 +1012,18 @@ def forward(
             shard_step = x_shards[i].shape[0]
             shard_offset = i * x_shards[0].shape[0]

-            x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
-
-            with torch.enable_grad():
-                args = (self, x_shard, y_shard)
-                if mask is not None:
-                    args += (mask_shards[i],)
+            args = (self, x_shard, y_shard)
+            if mask is not None:
+                args += (mask_shards[i],)
+            if x_grad is not None:
+                x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
+                with torch.enable_grad():
+                    output = fn(*args)
+                    output_shards.append(output)
+                    torch.autograd.backward(output, incoming_grad)
+            else:
                 output = fn(*args)
                 output_shards.append(output)
-                torch.autograd.backward(output, incoming_grad)

         output_unsharded = torch.cat([l.unsqueeze(0) for l in output_shards], dim=0)

@@ -1025,9 +1033,10 @@ def forward(
             output = output_unsharded.sum()

         # unflatten
-        x_grad = x_grad.view(bs, seqlen, *x_grad.shape[1:])
+        if x_grad is not None:
+            x_grad = x_grad.view(bs, seqlen, *x_grad.shape[1:])
+            ctx.save_for_backward(x_grad.detach())

-        ctx.save_for_backward(x_grad.detach())
         return output

     @staticmethod
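The forward-path hunks all serve the inference-mode guard named in the XXX comment: when the input does not require grad, the tiled forward skips allocating the x_grad buffer, skips the per-shard enable_grad/backward step, and saves nothing for the backward pass. A reduced, self-contained sketch of that control flow (function and variable names here are illustrative, not the PR's API):

import torch

def tiled_loss(fn, x, y, shards=2):
    # Illustrative sketch: run fn shard by shard; only build a graph and call
    # backward when x participates in autograd, otherwise do a plain forward.
    x_requires_grad = x.requires_grad
    x_grad = torch.zeros_like(x) if x_requires_grad else None  # nothing to fill in inference
    output_shards = []
    x_shards = list(torch.chunk(x, chunks=shards, dim=0))
    y_shards = list(torch.chunk(y, chunks=shards, dim=0))
    for i, (x_shard, y_shard) in enumerate(zip(x_shards, y_shards)):
        if x_grad is not None:
            # make the shard a graph leaf so its grad can be captured
            x_shard = x_shard.detach().requires_grad_(True)
            with torch.enable_grad():
                out = fn(x_shard, y_shard)
                # scale so the accumulated grads match the mean over shards
                out.backward(torch.full_like(out, 1.0 / shards))
            output_shards.append(out.detach())
            # copy the shard grad back into the pre-allocated buffer
            offset = i * x_shards[0].shape[0]
            x_grad.narrow(0, offset, x_shard.shape[0]).copy_(x_shard.grad)
        else:
            # inference: no graph, no backward, nothing saved
            output_shards.append(fn(x_shard, y_shard))
    return torch.stack(output_shards).mean(), x_grad

x = torch.randn(8, 4, requires_grad=True)
y = torch.randn(8, 4)
mse = lambda a, b: ((a - b) ** 2).mean()
loss, x_grad = tiled_loss(mse, x, y)                    # training path: x_grad is filled
loss_eval, none_grad = tiled_loss(mse, x.detach(), y)   # inference path: none_grad is None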