-
Notifications
You must be signed in to change notification settings - Fork 361
88 lines (82 loc) · 3.04 KB
/
gpu_tests.yml
File metadata and controls
88 lines (82 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Workflow that runs the GPU test sessions.
name: GPU tests

on:
  # Pushes to branches matching pull-request/[0-9]+
  push:
    branches:
      - "pull-request/[0-9]+"
    # NOTE: a `paths` filter cannot be used here since the push happens to a copied PR
    # and only the latest commit to the PR is used
  # Nightly
  schedule:
    - cron: "0 0 * * *"
  # On-demand
  workflow_dispatch:
concurrency:
  # Cancel previous runs if a new commit is pushed to the same PR.
  # Refs under refs/heads/pull-request/ are grouped by ref; any other ref is grouped by
  # commit SHA. The folded scalar below parses to a single-line expression.
  cancel-in-progress: true
  group: >-
    ${{ github.workflow }}-${{
    startsWith(github.ref, 'refs/heads/pull-request/') && github.ref || github.sha
    }}
jobs:
pr-gate:
  # Calls the reusable _pr_gate.yml workflow with the path patterns relevant to GPU tests.
  # The jobs below read its `any_changed` output and its result.
  # NOTE(review): the gate logic itself lives in _pr_gate.yml and is not visible here.
  uses: ./.github/workflows/_pr_gate.yml
  with:
    # Newline-separated path patterns handed to the gate workflow
    files: |
      .github/workflows/gpu_tests.yml
      modelopt/**
      noxfile.py
      pyproject.toml
      tests/gpu/**
      tests/gpu_megatron/**
      tests/gpu_trtllm/**
  secrets: inherit
  permissions:
    checks: read
gpu-tests:
  # Runs one nox session per matrix entry inside an nvcr.io/nvidia container on a GPU
  # runner, then uploads coverage.xml to Codecov.
  needs: [pr-gate]
  # Only run when the pr-gate job reports any_changed == 'true'
  if: needs.pr-gate.outputs.any_changed == 'true'
  strategy:
    # Let the remaining matrix entries finish even if one of them fails
    fail-fast: false
    matrix:
      # example         -> nox session name
      # timeout         -> job timeout in minutes (used by timeout-minutes below)
      # container_image -> image path appended to nvcr.io/nvidia/
      include:
        - example: gpu
          timeout: 60
          # tests/gpu/_extensions/test_onnx_extensions.py fails for newer containers until
          # https://github.com/tbenthompson/cppimport/pull/98 lands
          container_image: pytorch:26.01-py3
        - example: gpu_megatron
          timeout: 45
          container_image: nemo:26.04
        - example: gpu_trtllm
          timeout: 30
          container_image: tensorrt-llm/release:1.3.0rc10
  # Refs under refs/heads/pull-request/ use the "-1" runner label; all other refs use "-2"
  runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
  timeout-minutes: ${{ matrix.timeout }}
  container:
    image: nvcr.io/nvidia/${{ matrix.container_image }}
    env:
      # NOTE(review): actions/checkout takes its clone depth from its `fetch-depth` input,
      # not from this variable - confirm that something actually consumes GIT_DEPTH.
      GIT_DEPTH: 1000 # For correct version
      PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
  steps:
    - uses: actions/checkout@v6
    # NOTE(review): @main is a mutable ref; consider pinning to a tag or commit SHA.
    - uses: nv-gha-runners/setup-proxy-cache@main
    - name: Setup environment variables
      # Append extra directories to LD_LIBRARY_PATH for the following steps.
      # ${VAR:+...} only emits "<old value>:" when LD_LIBRARY_PATH is set and non-empty,
      # so an unset variable no longer yields a leading ":" (an empty entry, which the
      # dynamic loader treats as the current working directory).
      run: |
        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/include:/usr/lib/x86_64-linux-gnu" >> "$GITHUB_ENV"
    - name: Run gpu tests
      env:
        # Coverage settings for the test run, both pointing inside the workspace
        COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
        COVERAGE_FILE: ${{ github.workspace }}/.coverage
      # Install nox, then run the session named after the matrix entry
      run: |
        python -m pip install nox && nox -s ${{ matrix.example }}
    - name: Upload GPU coverage to Codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        files: coverage.xml
        flags: gpu
        fail_ci_if_error: false # test may be skipped if relevant file changes are not detected
        verbose: true
gpu-pr-required-check:
  # Single pass/fail job for refs under refs/heads/pull-request/.
  # It fails when pr-gate did not succeed, or when pr-gate reported changes and
  # gpu-tests did not succeed; otherwise it completes without running any step.
  needs:
    - pr-gate
    - gpu-tests
  # always(): run even if gpu-tests is skipped
  if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
  runs-on: ubuntu-latest
  steps:
    - name: Required GPU tests did not succeed
      # The folded scalar below parses to a single-line expression
      if: >-
        ${{ needs.pr-gate.result != 'success' ||
        (needs.pr-gate.outputs.any_changed == 'true' && needs.gpu-tests.result != 'success') }}
      run: exit 1