Skip to content

Commit 05c42e0

Browse files
authored
Merge branch 'master' into feature/world_size_getter
2 parents d8f1823 + a54c394 commit 05c42e0

File tree

7 files changed

+308
-15
lines changed

7 files changed

+308
-15
lines changed
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
name: modal-accelerate
2+
3+
# This CI is running on modal.com's GPUs.
4+
#
5+
# It's set up here on github actions and then the cloned repo is sent to modal and everything
6+
# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are
7+
# run.
8+
#
9+
# Both files are annotated with what's important and how one might change or update things if needed.
10+
#
11+
# Note that since this is a Required job we can't use the `on.push.paths` file filter - we are using
12+
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
13+
# Required status for PRs to pass.
14+
#
15+
16+
17+
on:
18+
workflow_dispatch:
19+
push:
20+
branches:
21+
- master
22+
23+
pull_request:
24+
paths-ignore:
25+
- 'docs/**'
26+
- 'blogs/**'
27+
- 'deepspeed/inference/v2/**'
28+
- 'tests/unit/inference/v2/**'
29+
types: [draft, opened, ready_for_review, synchronize]
30+
branches:
31+
- master
32+
33+
concurrency:
34+
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
35+
cancel-in-progress: true
36+
37+
jobs:
38+
collect-tests:
39+
name: Collect tests to run
40+
runs-on: ubuntu-latest
41+
permissions:
42+
contents: read
43+
pull-requests: read
44+
outputs:
45+
deepspeed: ${{ steps.filter.outputs.deepspeed }}
46+
47+
steps:
48+
- name: Checkout repository
49+
uses: actions/checkout@v4
50+
with:
51+
lfs: true
52+
53+
- name: Filter changed files
54+
uses: dorny/paths-filter@v2
55+
id: filter
56+
with:
57+
token: ${{ secrets.GITHUB_TOKEN }}
58+
filters: |
59+
deepspeed:
60+
- 'deepspeed/**'
61+
- '.github/workflows/modal*.yml'
62+
- 'ci/**'
63+
- 'tests/unit/**'
64+
- 'csrc/**'
65+
66+
deploy:
67+
name: DeepSpeedAI CI
68+
runs-on: ubuntu-latest
69+
needs: collect-tests
70+
env:
71+
# these are created at https://modal.com/settings/deepspeedai/tokens
72+
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
73+
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
74+
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
75+
# this one comes from https://huggingface.co/settings/profile of the bot user
76+
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
77+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
78+
79+
if: needs.collect-tests.outputs.deepspeed == 'true'
80+
steps:
81+
- name: Checkout Repository
82+
uses: actions/checkout@v4
83+
with:
84+
lfs: true
85+
86+
- name: Install Python
87+
uses: actions/setup-python@v5
88+
with:
89+
python-version: "3.10"
90+
cache: 'pip' # caching pip dependencies
91+
92+
- name: Install build dependencies
93+
run: |
94+
pip install uv # much faster than pip
95+
uv pip install --system modal
96+
97+
- name: Run tests
98+
run: |
99+
modal run -m ci.accelerate
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
name: modal-torch-latest
2+
3+
# This CI is running on modal.com's GPUs.
4+
#
5+
# It's set up here on github actions and then the cloned repo is sent to modal and everything
6+
# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
7+
# run.
8+
#
9+
# Both files are annotated with what's important and how one might change or update things if needed.
10+
#
11+
# Note that since this is a Required job we can't use the `on.push.paths` file filter - we are using
12+
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
13+
# Required status for PRs to pass.
14+
#
15+
16+
17+
on:
18+
workflow_dispatch:
19+
push:
20+
branches:
21+
- master
22+
23+
pull_request:
24+
paths-ignore:
25+
- 'docs/**'
26+
- 'blogs/**'
27+
- 'deepspeed/inference/v2/**'
28+
- 'tests/unit/inference/v2/**'
29+
types: [draft, opened, ready_for_review, synchronize]
30+
branches:
31+
- master
32+
33+
concurrency:
34+
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
35+
cancel-in-progress: true
36+
37+
jobs:
38+
collect-tests:
39+
name: Collect tests to run
40+
runs-on: ubuntu-latest
41+
permissions:
42+
contents: read
43+
pull-requests: read
44+
outputs:
45+
deepspeed: ${{ steps.filter.outputs.deepspeed }}
46+
47+
steps:
48+
- name: Checkout repository
49+
uses: actions/checkout@v4
50+
with:
51+
lfs: true
52+
53+
- name: Filter changed files
54+
uses: dorny/paths-filter@v2
55+
id: filter
56+
with:
57+
token: ${{ secrets.GITHUB_TOKEN }}
58+
filters: |
59+
deepspeed:
60+
- 'deepspeed/**'
61+
- '.github/workflows/modal*.yml'
62+
- 'ci/**'
63+
- 'tests/unit/**'
64+
- 'csrc/**'
65+
66+
deploy:
67+
name: DeepSpeedAI CI
68+
runs-on: ubuntu-latest
69+
needs: collect-tests
70+
env:
71+
# these are created at https://modal.com/settings/deepspeedai/tokens
72+
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
73+
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
74+
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
75+
# this one comes from https://huggingface.co/settings/profile of the bot user
76+
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
77+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
78+
79+
if: needs.collect-tests.outputs.deepspeed == 'true'
80+
steps:
81+
- name: Checkout Repository
82+
uses: actions/checkout@v4
83+
with:
84+
lfs: true
85+
86+
- name: Install Python
87+
uses: actions/setup-python@v5
88+
with:
89+
python-version: "3.10"
90+
cache: 'pip' # caching pip dependencies
91+
92+
- name: Install build dependencies
93+
run: |
94+
pip install uv # much faster than pip
95+
uv pip install --system modal
96+
97+
- name: Run tests
98+
run: |
99+
modal run -m ci.torch_latest

ci/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Copyright (c) DeepSpeed Team.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# DeepSpeed Team

ci/accelerate.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright (c) Snowflake.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# DeepSpeed Team
5+
6+
from pathlib import Path
7+
8+
import modal
9+
10+
ROOT_PATH = Path(__file__).parents[1]
11+
12+
# yapf: disable
13+
image = (modal.Image
14+
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
15+
.run_commands("apt update && apt install -y libaio-dev")
16+
.apt_install("git")
17+
.run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
18+
.run_commands(
19+
"git clone https://github.com/huggingface/accelerate && \
20+
uv pip install --system --compile-bytecode ./accelerate[testing]"
21+
)
22+
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
23+
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
24+
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
25+
.run_commands("pip install /root")
26+
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
27+
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
28+
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
29+
)
30+
31+
app = modal.App("deepspeedai-accelerate-ci", image=image)
32+
33+
@app.function(
34+
gpu="l40s:1",
35+
timeout=1800,
36+
)
37+
def pytest():
38+
import subprocess
39+
subprocess.run(
40+
"pytest /accelerate/tests/deepspeed".split(),
41+
check=True,
42+
cwd=ROOT_PATH / ".",
43+
)

ci/torch_latest.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright (c) Snowflake.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# DeepSpeed Team
5+
6+
from pathlib import Path
7+
8+
import modal
9+
10+
ROOT_PATH = Path(__file__).parents[1]
11+
12+
# yapf: disable
13+
image = (modal.Image
14+
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
15+
.run_commands("apt update && apt install -y libaio-dev")
16+
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
17+
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
18+
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
19+
.run_commands("pip install /root")
20+
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
21+
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
22+
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
23+
)
24+
25+
26+
app = modal.App("deepspeedai-torch-latest-ci", image=image)
27+
28+
29+
@app.function(
30+
gpu="l40s:2",
31+
timeout=1800,
32+
)
33+
def pytest():
34+
import subprocess
35+
subprocess.run(
36+
"pytest -n 4 --verbose tests/unit/runtime/zero/test_zero.py tests/unit/runtime/half_precision/test_bf16.py --torch_ver=2.6 --cuda_ver=12.4".split(),
37+
check=True,
38+
cwd=ROOT_PATH / ".",
39+
)

deepspeed/runtime/sequence_parallel/ulysses_sp.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -497,8 +497,12 @@ def __next__(self):
497497
return self.micro_batches.pop(0)
498498

499499
def refill(self):
500-
# this will raise StopIteration when empty
501-
batch = next(self.iter)
500+
# reset the iterator if StopIteration arrives, and re-raise it to allow multiple epochs to run
501+
try:
502+
batch = next(self.iter)
503+
except StopIteration:
504+
self.iter = iter(self.dl)
505+
raise StopIteration
502506
micro_batches = defaultdict(dict)
503507
# XXX: replace with more efficient all-to-all?
504508

@@ -982,7 +986,8 @@ def forward(
982986
if output_reduction == "mean":
983987
incoming_grad /= shards
984988

985-
x_grad = torch.zeros_like(x)
989+
# XXX: deal with the use case of running in inference mode, where we don't need backward
990+
x_grad = torch.zeros_like(x) if x_requires_grad else None
986991
x_shards = list(torch.chunk(x, chunks=shards, dim=0))
987992
y_shards = list(torch.chunk(y, chunks=shards, dim=0))
988993
if mask is not None:
@@ -1007,15 +1012,18 @@ def forward(
10071012
shard_step = x_shards[i].shape[0]
10081013
shard_offset = i * x_shards[0].shape[0]
10091014

1010-
x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
1011-
1012-
with torch.enable_grad():
1013-
args = (self, x_shard, y_shard)
1014-
if mask is not None:
1015-
args += (mask_shards[i], )
1015+
args = (self, x_shard, y_shard)
1016+
if mask is not None:
1017+
args += (mask_shards[i], )
1018+
if x_grad is not None:
1019+
x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
1020+
with torch.enable_grad():
1021+
output = fn(*args)
1022+
output_shards.append(output)
1023+
torch.autograd.backward(output, incoming_grad)
1024+
else:
10161025
output = fn(*args)
10171026
output_shards.append(output)
1018-
torch.autograd.backward(output, incoming_grad)
10191027

10201028
output_unsharded = torch.cat([l.unsqueeze(0) for l in output_shards], dim=0)
10211029

@@ -1025,9 +1033,10 @@ def forward(
10251033
output = output_unsharded.sum()
10261034

10271035
# unflatten
1028-
x_grad = x_grad.view(bs, seqlen, *x_grad.shape[1:])
1036+
if x_grad is not None:
1037+
x_grad = x_grad.view(bs, seqlen, *x_grad.shape[1:])
1038+
ctx.save_for_backward(x_grad.detach())
10291039

1030-
ctx.save_for_backward(x_grad.detach())
10311040
return output
10321041

10331042
@staticmethod

tests/unit/runtime/zero/test_zero.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ def test(self, allgather_bucket_size, zero_stage=2):
394394

395395

396396
class TestPartitionNcclAlignment(DistributedTest):
397-
world_size = 4
397+
world_size = 2
398398

399399
def test(self, zero_stage=2):
400400
config_dict = {
@@ -835,7 +835,7 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor:
835835
@pytest.mark.parametrize("init_context_manager", [True, False])
836836
@pytest.mark.parametrize("reduce_scatter", [True, False])
837837
class TestZero3ParamPartitioningLargeParam(DistributedTest):
838-
world_size = 4
838+
world_size = 2
839839

840840
def test(self, init_context_manager: bool, reduce_scatter: bool, param_sz: int = 8100) -> None:
841841

@@ -997,7 +997,7 @@ def forward(self, x: Tensor) -> Tensor:
997997

998998

999999
class TestZero3InitForParentWeightInitialization(DistributedTest):
1000-
world_size = 4
1000+
world_size = 2
10011001

10021002
def test(self):
10031003

0 commit comments

Comments
 (0)