Skip to content

Commit 9415711

Browse files
authored
Add pytorch canary build to CI. (#6276)
[skip-matrix][skip-vdc][skip-docs]
1 parent 232beaa commit 9415711

File tree

8 files changed

+280
-14
lines changed

8 files changed

+280
-14
lines changed

.github/workflows/build-matx.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ on:
3939
jobs:
4040
build-matx:
4141
name: Build MatX
42-
runs-on: ${{ github.repository == 'NVIDIA/cccl' && 'linux-amd64-cpu32' || 'ubuntu-latest' }}
42+
runs-on: linux-amd64-cpu16m
4343
permissions:
4444
id-token: write
4545
contents: read
@@ -66,7 +66,7 @@ jobs:
6666
#! /usr/bin/env bash
6767
set -eo pipefail
6868
69-
~/cccl/ci/matx/build_matx.sh;
69+
~/cccl/ci/matx/build_matx.sh
7070
sccache --show-adv-stats
7171
EOF
7272
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
name: Build PyTorch
2+
3+
on:
4+
workflow_dispatch:
5+
inputs:
6+
override_cccl_tag:
7+
description: "If set, override the tag used when pulling the CCCL repository into PyTorch."
8+
required: false
9+
default: ""
10+
type: string
11+
enable_slack_alerts:
12+
description: "If true, a message will be posted to the CCCL GHA CI Alert channel if the workflow fails."
13+
required: false
14+
default: false
15+
type: boolean
16+
workflow_call:
17+
inputs:
18+
override_cccl_tag:
19+
description: "If set, override the tag used when pulling the CCCL repository into PyTorch."
20+
required: false
21+
default: ""
22+
type: string
23+
enable_slack_alerts:
24+
description: "If true, a message will be posted to the CCCL GHA CI Alert channel if the workflow fails."
25+
required: false
26+
default: false
27+
type: boolean
28+
29+
jobs:
30+
build-pytorch:
31+
name: Build PyTorch
32+
runs-on: linux-amd64-cpu16m
33+
permissions:
34+
id-token: write
35+
contents: read
36+
steps:
37+
- name: Checkout repo
38+
uses: actions/checkout@v4
39+
with:
40+
fetch-depth: 0
41+
persist-credentials: false
42+
- name: Add NVCC problem matcher
43+
run: echo "::add-matcher::$(pwd)/.github/problem-matchers/problem-matcher.json"
44+
- uses: aws-actions/configure-aws-credentials@v4
45+
if: ${{ github.repository == 'NVIDIA/cccl' }}
46+
with:
47+
role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
48+
aws-region: us-east-2
49+
role-duration-seconds: 43200 # 12h
50+
- name: Run command
  env:
    # Optional ref override taken from the workflow inputs. When empty,
    # ci/pytorch/build_pytorch.sh falls back to the HEAD of the local checkout.
    # (The former CCCL_VERSION entry read `inputs.override_cccl_version`, which
    # this workflow never declares and nothing downstream consumed.)
    CCCL_TAG: ${{ inputs.override_cccl_tag }}
  run: |
    # Write the script that will run inside the devcontainer. The quoted
    # heredoc delimiter keeps the body literal (no expansion on the runner).
    cat <<"EOF" > "$RUNNER_TEMP/ci.sh"
    #! /usr/bin/env bash
    set -eo pipefail

    ~/cccl/ci/pytorch/build_pytorch.sh
    sccache --show-adv-stats
    EOF

    chmod +x "$RUNNER_TEMP/ci.sh"

    mkdir -p .aws

    cat <<EOF > .aws/config
    [default]
    bucket=rapids-sccache-devs
    region=us-east-2
    EOF

    # Unquoted heredoc: the AWS_* variables are expanded here on the runner.
    # Each defaults to empty when unset (e.g. when the credentials step above
    # was skipped because the repository is not NVIDIA/cccl).
    cat <<EOF > .aws/credentials
    [default]
    aws_access_key_id=${AWS_ACCESS_KEY_ID:-}
    aws_session_token=${AWS_SESSION_TOKEN:-}
    aws_secret_access_key=${AWS_SECRET_ACCESS_KEY:-}
    EOF

    chmod 0600 .aws/credentials
    chmod 0664 .aws/config

    # Launch the devcontainer and run the generated script, forwarding the
    # CCCL override and the GitHub identifiers into the container.
    .devcontainer/launch.sh \
      --docker \
      --cuda 13.0 \
      --host gcc14 \
      --cuda-ext \
      --env "CCCL_TAG=${CCCL_TAG}" \
      --env VAULT_HOST= \
      --env "GITHUB_SHA=$GITHUB_SHA" \
      --env "GITHUB_REF_NAME=$GITHUB_REF_NAME" \
      --env "GITHUB_REPOSITORY=$GITHUB_REPOSITORY" \
      --volume "$RUNNER_TEMP/ci.sh:/ci.sh" \
      -- /ci.sh
95+
96+
# Posts a Slack alert when the build-pytorch job fails. Only runs when the
# caller opted in through the `enable_slack_alerts` input.
notify-failure:
  name: Notify Slack of PyTorch failure
  if: ${{ failure() && inputs.enable_slack_alerts }}
  needs: build-pytorch
  runs-on: ubuntu-latest
  steps:
    - name: Notify
      # NOTE(review): the action reference was mangled to "[email protected]"
      # in this copy. `channel-id` / `slack-message` are v1 inputs of
      # slack-github-action; confirm the exact pinned version against the repo.
      uses: slackapi/slack-github-action@v1.26.0
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_NOTIFIER_BOT_TOKEN }}
        WORKFLOW_TYPE: ${{ github.workflow }}
        SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
      with:
        channel-id: ${{ secrets.SLACK_CHANNEL_CI_ALERT }}
        slack-message: |
          PyTorch build in workflow '${{ env.WORKFLOW_TYPE }}' failed.

          Details: ${{ env.SUMMARY_URL }}

.github/workflows/ci-workflow-nightly.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,3 +160,13 @@ jobs:
160160
uses: ./.github/workflows/build-matx.yml
161161
with:
162162
enable_slack_alerts: true
163+
164+
build-pytorch:
165+
name: Build PyTorch
166+
secrets: inherit
167+
permissions:
168+
id-token: write
169+
contents: read
170+
uses: ./.github/workflows/build-pytorch.yml
171+
with:
172+
enable_slack_alerts: true

.github/workflows/ci-workflow-pull-request.yml

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,18 +45,21 @@ jobs:
4545
docs_enabled: ${{ steps.export-flags.outputs.docs_enabled }}
4646
rapids_enabled: ${{ steps.export-flags.outputs.rapids_enabled }}
4747
matx_enabled: ${{ steps.export-flags.outputs.matx_enabled }}
48+
pytorch_enabled: ${{ steps.export-flags.outputs.pytorch_enabled }}
4849
steps:
4950
- name: Export workflow flags
5051
id: export-flags
52+
env:
53+
skip_tpt: ${{ contains(github.event.head_commit.message, '[skip-tpt]') || contains(github.event.head_commit.message, '[skip-third-party-testing]') }}
5154
run: |
5255
output() { echo "$1=$2" | tee -a "${GITHUB_OUTPUT}"; }
5356
54-
output matrix_enabled "${{ !contains(github.event.head_commit.message, '[skip-matrix]') }}"
55-
output vdc_enabled "${{ !contains(github.event.head_commit.message, '[skip-vdc]') }}"
56-
output docs_enabled "${{ !contains(github.event.head_commit.message, '[skip-docs]') }}"
57-
output rapids_enabled "${{ contains(github.event.head_commit.message, '[test-rapids]') }}"
58-
# MatX build OOMs the public github runners used on forks:
59-
output matx_enabled "${{ !contains(github.event.head_commit.message, '[skip-matx]') && github.repository == 'NVIDIA/cccl' }}"
57+
output matrix_enabled "${{ !contains(github.event.head_commit.message, '[skip-matrix]') }}"
58+
output vdc_enabled "${{ !contains(github.event.head_commit.message, '[skip-vdc]') }}"
59+
output docs_enabled "${{ !contains(github.event.head_commit.message, '[skip-docs]') }}"
60+
output rapids_enabled "${{ contains(github.event.head_commit.message, '[test-rapids]') && !fromJSON(env.skip_tpt) }}"
61+
output matx_enabled "${{ !contains(github.event.head_commit.message, '[skip-matx]') && !fromJSON(env.skip_tpt) }}"
62+
output pytorch_enabled "${{ !contains(github.event.head_commit.message, '[skip-pytorch]') && !fromJSON(env.skip_tpt) }}"
6063
- name: Checkout repo
6164
uses: actions/checkout@v4
6265
with:
@@ -240,6 +243,16 @@ jobs:
240243
contents: read
241244
uses: ./.github/workflows/build-matx.yml
242245

246+
build-pytorch:
247+
name: Build PyTorch (optional)
248+
needs: build-workflow
249+
if: ${{ needs.build-workflow.outputs.pytorch_enabled == 'true' }}
250+
secrets: inherit
251+
permissions:
252+
id-token: write
253+
contents: read
254+
uses: ./.github/workflows/build-pytorch.yml
255+
243256
# Check all other job statuses. This job gates branch protection checks.
244257
ci:
245258
name: CI

AGENTS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,9 @@ Tags appended to the commit summary (case-sensitive) control CI behavior:
303303
* `[skip-matrix]`: Skip CCCL project build/test jobs. (Docs, devcontainers, and third-party builds still run.)
304304
* `[skip-vdc]`: Skip "Verify Devcontainer" jobs. Safe unless CI or devcontainer infra is modified.
305305
* `[skip-docs]`: Skip doc tests/previews. Safe if docs are unaffected.
306+
* `[skip-third-party-testing]` / `[skip-tpt]`: Skip third-party smoke tests (MatX, PyTorch, RAPIDS).
306307
* `[skip-matx]`: Skip building the MatX third-party smoke test.
308+
* `[skip-pytorch]`: Skip building the PyTorch third-party smoke test.
307309

308310
> ⚠️ All of these tags block merging until removed and a full CI run (with no overrides) succeeds.
309311

ci-overview.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ These commands can be combined with the [override matrix](#temporarily-overridin
3434
- `[skip-matrix]`: Skip all build and test jobs specified in `ci/matrix.yaml`.
3535
- `[skip-vdc]`: Skip all "Validate Devcontainer" jobs.
3636
- `[skip-docs]`: Skip the documentation verification build.
37+
- `[skip-third-party-testing]` (alias `[skip-tpt]`): Skip all third-party canary builds (MatX, PyTorch, and RAPIDS).
3738
- `[skip-matx]`: Skip all MatX canary builds.
38-
- **Example:** `git commit -m "README tidy-up [skip-matrix][skip-vdc][skip-docs][skip-matx]"`
39+
- `[skip-pytorch]`: Skip all PyTorch canary builds.
40+
- **Example:** `git commit -m "README tidy-up [skip-matrix][skip-vdc][skip-docs][skip-third-party-testing]"`
3941

4042
- `[test-rapids]`: Opt‑in to run RAPIDS canary builds alongside CCCL CI.
4143

ci/matx/build_matx.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ log_vars() {
1818
version_max() {
1919
local v1="${1}"
2020
local v2="${2}"
21-
if [[ "$(printf "%s\n" "${v1}" "${v2}" | sort -V | head -n1)" == "${v1}" ]]; then
22-
echo "${v2}"
21+
if ci/util/version_compare.sh "$v1" ge "$v2"; then
22+
echo "$v1"
2323
else
24-
echo "${v1}"
24+
echo "$v2"
2525
fi
2626
}
2727

@@ -95,11 +95,11 @@ rm -rf build
9595
mkdir build
9696
cd build
9797
cmake -G Ninja ../MatX \
98-
"-DCMAKE_CUDA_ARCHITECTURES=75;80" \
98+
"-DCMAKE_CUDA_ARCHITECTURES=75;120" \
9999
"-DRAPIDS_CMAKE_CPM_OVERRIDE_VERSION_FILE=${version_override_file}" \
100100
-DMATX_BUILD_TESTS=ON \
101101
-DMATX_BUILD_EXAMPLES=ON \
102102
-DMATX_BUILD_BENCHMARKS=ON \
103103
-DMATX_EN_CUTENSOR=ON
104104

105-
cmake --build . -j 8
105+
cmake --build .

ci/pytorch/build_pytorch.sh

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#!/bin/bash
2+
3+
set -euo pipefail
4+
5+
readonly pytorch_repo=https://github.com/pytorch/pytorch.git
6+
readonly pytorch_branch=main
7+
8+
# Ensure the script is being executed in the root cccl directory:
9+
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/../..";
10+
readonly cccl_repo="${PWD}"
11+
12+
log_vars() {
13+
for var in "$@"; do
14+
echo "${var}=${!var}"
15+
done
16+
}
17+
18+
# Define CCCL_TAG to override the default CCCL SHA. Otherwise the current HEAD of the local checkout is used.
19+
echo "CCCL_TAG (override): ${CCCL_TAG-}";
20+
if test -n "${CCCL_TAG-}"; then
21+
# If CCCL_TAG is defined, fetch it to the local checkout
22+
git -C "${cccl_repo}" fetch origin "${CCCL_TAG}";
23+
cccl_sha="$(git -C "${cccl_repo}" rev-parse FETCH_HEAD)";
24+
else
25+
cccl_sha="$(git -C "${cccl_repo}" rev-parse HEAD)";
26+
fi
27+
28+
readonly workdir="${cccl_repo}/build/${CCCL_BUILD_INFIX:-}/pytorch"
29+
30+
log_vars \
31+
pytorch_repo pytorch_branch \
32+
cccl_repo cccl_sha \
33+
workdir
34+
35+
mkdir -p "${workdir}"
36+
cd "${workdir}"
37+
echo "Working in ${workdir}"
38+
39+
echo "::group::Cloning CCCL..."
40+
rm -rf cccl
41+
git clone "${cccl_repo}"
42+
git -C cccl checkout "${cccl_sha}"
43+
echo "CCCL HEAD:"
44+
git -C cccl log -1 --format=short
45+
echo "::endgroup::"
46+
47+
# Setup a CUDA environment with the requested CCCL.
48+
# Use a local directory to avoid modifying the actual CUDA install:
49+
echo "::group::Setting up clone of CUDA environment with custom CCCL..."
50+
(
51+
set -x
52+
rm -rf ./cuda
53+
cp -Hr /usr/local/cuda ./cuda
54+
rm -rf ./cuda/include/cccl/*
55+
cccl/ci/install_cccl.sh ./cccl-install > /dev/null
56+
cp -r ./cccl-install/include/* ./cuda/include/cccl
57+
)
58+
export PATH="$PWD/cuda/bin:$PATH"
59+
export CUDA_HOME="$PWD/cuda"
60+
export CUDA_PATH="$PWD/cuda"
61+
which nvcc
62+
nvcc --version
63+
echo "::endgroup::"
64+
65+
echo "::group::Cloning PyTorch..."
66+
rm -rf pytorch
67+
git clone ${pytorch_repo} -b ${pytorch_branch} --recursive --depth 1
68+
echo "PyTorch HEAD:"
69+
git -C pytorch log -1 --format=short
70+
echo "::endgroup::"
71+
72+
echo "::group::Installing PyTorch build dependencies..."
73+
pytorch_root="$PWD/pytorch"
74+
export PYTHONPATH="${pytorch_root}:${pytorch_root}/tools:${PYTHONPATH:-}"
75+
pip install -r "${pytorch_root}/requirements-build.txt"
76+
echo "::endgroup::"
77+
78+
echo "::group::Configuring PyTorch..."
79+
rm -rf build
80+
mkdir build
81+
declare -a cmake_args=(
82+
"-DUSE_NCCL=OFF"
83+
# Need to define this explicitly, torch's FindCUDA logic adds ancient arches if left undefined:
84+
"-DTORCH_CUDA_ARCH_LIST=7.5;8.0;9.0;10.0;12.0"
85+
)
86+
cmake -S ./pytorch -B ./build -G Ninja "${cmake_args[@]}"
87+
echo "::endgroup::"
88+
89+
# Verify that the configured build is using the custom CUDA dir for CTK and nvcc.
# -F matches the expected cache entries as fixed strings, so characters in $PWD
# that are special in regular expressions (e.g. '.') are not treated as patterns.
if ! grep -qF "CUDA_TOOLKIT_ROOT_DIR:PATH=$PWD/cuda" ./build/CMakeCache.txt; then
  echo "Error: CUDA_TOOLKIT_ROOT_DIR does not point to the custom CUDA";
  exit 1;
fi
if ! grep -qF "CUDA_NVCC_EXECUTABLE:FILEPATH=$PWD/cuda/bin/nvcc" ./build/CMakeCache.txt; then
  echo "Error: CUDA_NVCC_EXECUTABLE does not point to the custom CUDA";
  exit 1;
fi
98+
99+
# This builds a bunch of unnecessary targets. Leaving here to use as a fallback if the
100+
# ninja target extraction below starts failing:
101+
# echo "::group::Building torch_cuda target..."
102+
# cmake --build ./build/ --target torch_cuda
103+
# echo "::endgroup::"
104+
105+
# This cuts the number of built targets roughly in half:
echo "::group::Extracting cuda targets from build.ninja..."
# Query ninja for all object files built from CUDA source files in ATen/native/cuda/
# that are part of the torch_cuda library.
#
# grep exits with status 1 when nothing matches. With `set -o pipefail` active,
# that would abort the script right here, before the explicit target-count check
# below can report the problem. Accept status 1 (no matches) but still fail on
# any other grep status (real errors).
ninja -C ./build -t query lib/libtorch_cuda.so |
  { grep -E "ATen/native/cuda/.*\\.cu\\.o$" || test $? -eq 1; } |
  sort -u | tee build/cuda_targets.txt
# At the time this script was written, there were 217 cuda targets.
# Check that there are at least 100 detected targets, otherwise fail.
num_targets=$(wc -l < build/cuda_targets.txt)
if test "$num_targets" -lt 100; then
  echo "Error: extracted cuda targets count is less than 100! ($num_targets)";
  echo "This likely indicates a failure to extract the targets from ninja.";
  exit 1;
fi
echo "::endgroup::"
121+
122+
echo "::group::Building pytorch CUDA targets with custom CCCL..."
123+
ninja -C ./build $(xargs -a build/cuda_targets.txt)
124+
echo "::endgroup::"
125+
126+
echo "PyTorch CUDA targets built successfully with custom CCCL."

0 commit comments

Comments
 (0)