|
#!/bin/bash

# Build PyTorch's CUDA targets against a custom CCCL checkout.
# Fail fast: abort on errors, unset variables, and pipeline failures.
set -euo pipefail

readonly pytorch_repo=https://github.com/pytorch/pytorch.git
readonly pytorch_branch=main

# Ensure the script is being executed in the root cccl directory
# (this script lives two levels below the repo root):
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "${script_dir}/../.."
readonly cccl_repo="${PWD}"
| 11 | + |
# Print each named variable as "name=value", one per line.
# Arguments: variable names to dump (resolved via indirect expansion).
log_vars() {
  local name
  for name in "$@"; do
    printf '%s=%s\n' "${name}" "${!name}"
  done
}
| 17 | + |
# Define CCCL_TAG to override the default CCCL SHA. Otherwise the current HEAD of the local checkout is used.
echo "CCCL_TAG (override): ${CCCL_TAG-}";
if test -n "${CCCL_TAG-}"; then
  # If CCCL_TAG is defined, fetch it to the local checkout.
  # rev-parse FETCH_HEAD must run immediately after the fetch: it resolves
  # the ref (tag/branch/SHA) that the fetch just wrote.
  git -C "${cccl_repo}" fetch origin "${CCCL_TAG}";
  cccl_sha="$(git -C "${cccl_repo}" rev-parse FETCH_HEAD)";
else
  # No override: pin to whatever the local checkout currently points at.
  cccl_sha="$(git -C "${cccl_repo}" rev-parse HEAD)";
fi

# Scratch directory for all clones/builds; CCCL_BUILD_INFIX (optional)
# namespaces it, e.g. per-devcontainer build trees.
readonly workdir="${cccl_repo}/build/${CCCL_BUILD_INFIX:-}/pytorch"

# Echo the effective configuration for the CI log:
log_vars \
  pytorch_repo pytorch_branch \
  cccl_repo cccl_sha \
  workdir
| 34 | + |
mkdir -p "${workdir}"
cd "${workdir}"
echo "Working in ${workdir}"

# Clone the *local* cccl repo into the workdir (cheap: local object sharing)
# and pin it to the SHA resolved above. ::group:: markers fold CI log output.
echo "::group::Cloning CCCL..."
rm -rf cccl
git clone "${cccl_repo}"
git -C cccl checkout "${cccl_sha}"
echo "CCCL HEAD:"
git -C cccl log -1 --format=short
echo "::endgroup::"
| 46 | + |
# Setup a CUDA environment with the requested CCCL.
# Use a local directory to avoid modifying the actual CUDA install:
echo "::group::Setting up clone of CUDA environment with custom CCCL..."
(
  set -x
  rm -rf ./cuda
  # -H dereferences the /usr/local/cuda symlink so the real toolkit is copied:
  cp -Hr /usr/local/cuda ./cuda
  # Swap the toolkit's bundled CCCL headers for the ones from our checkout:
  rm -rf ./cuda/include/cccl/*
  cccl/ci/install_cccl.sh ./cccl-install > /dev/null
  cp -r ./cccl-install/include/* ./cuda/include/cccl
)
# Point all CUDA discovery mechanisms at the cloned toolkit:
export PATH="$PWD/cuda/bin:$PATH"
export CUDA_HOME="$PWD/cuda"
export CUDA_PATH="$PWD/cuda"
# Sanity check: nvcc must resolve from the cloned toolkit now at the front
# of PATH. 'command -v' is the builtin, portable replacement for 'which'.
command -v nvcc
nvcc --version
echo "::endgroup::"
| 64 | + |
# Shallow-clone PyTorch with submodules at the pinned branch.
echo "::group::Cloning PyTorch..."
rm -rf pytorch
# Quote expansions so the command is robust even if the repo/branch values
# ever contain shell-special characters (SC2086):
git clone "${pytorch_repo}" -b "${pytorch_branch}" --recursive --depth 1
echo "PyTorch HEAD:"
git -C pytorch log -1 --format=short
echo "::endgroup::"

echo "::group::Installing PyTorch build dependencies..."
pytorch_root="$PWD/pytorch"
# PyTorch's CMake config imports helper modules from the source tree:
export PYTHONPATH="${pytorch_root}:${pytorch_root}/tools:${PYTHONPATH:-}"
pip install -r "${pytorch_root}/requirements-build.txt"
echo "::endgroup::"
| 77 | + |
echo "::group::Configuring PyTorch..."
rm -rf build
mkdir build
declare -a cmake_args=(
  "-DUSE_NCCL=OFF"
  # Need to define this explicitly, torch's FindCUDA logic adds ancient arches if left undefined:
  "-DTORCH_CUDA_ARCH_LIST=7.5;8.0;9.0;10.0;12.0"
)
cmake -S ./pytorch -B ./build -G Ninja "${cmake_args[@]}"
echo "::endgroup::"

# Verify that the configured build is using the custom CUDA dir for CTK and nvcc.
# Use grep -F so the interpolated $PWD is matched literally; otherwise regex
# metacharacters in the path (e.g. '.', '+') could produce false positives.
if ! grep -qF "CUDA_TOOLKIT_ROOT_DIR:PATH=$PWD/cuda" ./build/CMakeCache.txt; then
  echo "Error: CUDA_TOOLKIT_ROOT_DIR does not point to the custom CUDA" >&2
  exit 1
fi
if ! grep -qF "CUDA_NVCC_EXECUTABLE:FILEPATH=$PWD/cuda/bin/nvcc" ./build/CMakeCache.txt; then
  echo "Error: CUDA_NVCC_EXECUTABLE does not point to the custom CUDA" >&2
  exit 1
fi
| 98 | + |
# This builds a bunch of unnecessary targets. Leaving here to use as a fallback if the
# ninja target extraction below starts failing:
# echo "::group::Building torch_cuda target..."
# cmake --build ./build/ --target torch_cuda
# echo "::endgroup::"

# This cuts the number of built targets roughly in half:
echo "::group::Extracting cuda targets from build.ninja..."
# Query ninja for all object files built from CUDA source files in ATen/native/cuda/
# that are part of the torch_cuda library:
ninja -C ./build -t query lib/libtorch_cuda.so |
  grep -E "ATen/native/cuda/.*\.cu\.o$" |
  sort -u | tee build/cuda_targets.txt
# At the time this script was written, there were 217 cuda targets.
# Check that there are at least 100 detected targets, otherwise fail.
num_targets=$(wc -l < build/cuda_targets.txt)
if test "$num_targets" -lt 100; then
  echo "Error: extracted cuda targets count is less than 100! ($num_targets)" >&2
  echo "This likely indicates a failure to extract the targets from ninja." >&2
  exit 1
fi
echo "::endgroup::"
| 121 | + |
echo "::group::Building pytorch CUDA targets with custom CCCL..."
# Feed the target list to ninja via xargs directly. The previous form,
#   ninja -C ./build $(xargs -a build/cuda_targets.txt)
# relied on unquoted word-splitting of a command substitution and could
# overflow ARG_MAX for very large target lists; xargs handles batching.
xargs -a build/cuda_targets.txt ninja -C ./build
echo "::endgroup::"

echo "PyTorch CUDA targets built successfully with custom CCCL."