Skip to content

Commit 00b8957

Browse files
committed
Build Zoom wheel
1 parent aaef6b9 commit 00b8957

File tree

5 files changed

+260
-11
lines changed

5 files changed

+260
-11
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# Workflow that builds the Zoom wheel.
name: 'Build Zoom wheel'

on:
  # Runs for every pull request and for pushes to `main`.
  pull_request:
  push:
    branches: [main]
  # Manual runs can optionally open a tmate debugging session.
  workflow_dispatch:
    inputs:
      force_debug_with_tmate:
        description: "Run the build with tmate session"
        type: boolean
        default: false
        required: false
      debug_with_tmate:
        description: "Run the build with a tmate session ONLY in case of failure"
        type: boolean
        default: false
        required: false

# One active run per workflow and event number (falling back to the commit
# SHA when the event carries no number); a newer run cancels the older one.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
24+
25+
jobs:
  # Builds the wheel, repairs it with auditwheel, runs the Zoom tests, then
  # saves the ccache directory, uploads `dist` and publishes the wheels to the
  # `latest` release.
  build:

    strategy:
      fail-fast: false
      matrix:
        include:
          - name: "ubuntu-22.04"
            runs-on: "nodai-amdgpu-mi250-x86-64"
            # runs-on: "azure-cpubuilder-linux-scale"
            # container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0"

    runs-on: ${{ matrix.runs-on }}

    name: ${{ matrix.name }}

    env:
      CACHE_DIR: ${{ github.workspace }}/.container-cache
      # `<ref name>-<run number>`: the run number is part of the key, so every
      # run saves a new entry and restores an earlier one via `restore-keys`.
      CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}

    defaults:
      run:
        shell: bash

    permissions:
      id-token: write
      contents: write

    # No matrix entry currently defines `container` (it is commented out above).
    container:
      image: ${{ matrix.container }}

    steps:
      - name: "Check out repository"
        # NOTE(review): the pinned version was lost when this file was
        # extracted; restored to the major tag - confirm the intended pin.
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Enable cache
        uses: actions/cache/restore@v3
        with:
          path: ${{ env.CACHE_DIR }}
          key: ${{ env.CACHE_KEY }}
          restore-keys: linux-build-test-cpp-

      # - name: "Setting up Python"
      #   run: |
      #     sudo apt update
      #     sudo apt install software-properties-common -y
      #     sudo add-apt-repository ppa:deadsnakes/ppa -y
      #     sudo apt install python3.11 python3-pip -y
      #     sudo apt-get install python3.11-dev python3.11-venv build-essential -y

      - name: "Build PyTorch"
        id: build
        run: |
          # curl -sSL https://raw.githubusercontent.com/mrodden/get-rocm/refs/heads/master/get-rocm.py -o get-rocm.py
          # sudo python3.11 get-rocm.py --rocm-version 6.2.3

          export CCACHE_DIR="${{ env.CACHE_DIR }}"
          export CMAKE_C_COMPILER_LAUNCHER=ccache
          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
          export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros

          python3.11 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
          ./build.sh

      - name: "Audit"
        id: audit
        run: |
          # -y: there is no terminal to answer a confirmation prompt in CI.
          sudo apt-get install -y patchelf
          source venv/bin/activate
          pip install auditwheel
          auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch*

      - name: "Test"
        id: test
        run: |
          # Every `run` step starts a fresh shell, so the virtualenv created in
          # the build step has to be activated again here.
          source venv/bin/activate
          # build.sh only produces wheels (`setup.py bdist_wheel`); install the
          # one just built so `import torch.zoom` resolves inside the venv.
          # --no-index/--find-links stops pip from fetching torch from PyPI and
          # lets it pick whichever wheel in dist/ is compatible with this host.
          pip install --no-index --no-deps --find-links dist torch

          # smoke test
          python zoom_extension/examples/test.py
          # device tests
          PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh

          cat zoom_test_errors.log
          cat zoom_unimplemented_operators.log

      - name: Save cache
        uses: actions/cache/save@v3
        if: ${{ !cancelled() }}
        with:
          path: ${{ env.CACHE_DIR }}
          key: ${{ env.CACHE_KEY }}

      - name: Upload artifacts
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.name }}_artifact
          path: dist
          if-no-files-found: warn

      - name: Release current commit
        # NOTE(review): `!cancelled()` also runs this step after a failed build
        # and on pull_request events - confirm that is intended.
        if: ${{ !cancelled() }}
        # NOTE(review): the pinned version was lost when this file was
        # extracted; restored to the major tag - confirm the intended pin.
        uses: ncipollo/release-action@v1
        with:
          artifacts: "dist/torch*.whl"
          token: "${{ secrets.GITHUB_TOKEN }}"
          tag: "latest"
          name: "latest"
          removeArtifacts: false
          allowUpdates: true
          replacesArtifacts: true
          makeLatest: true

      - name: "Setup tmate session"
        if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }}
        # NOTE(review): the pinned version was lost when this file was
        # extracted; restored to the major tag - confirm the intended pin.
        uses: mxschmitt/action-tmate@v3
        with:
          limit-access-to-actor: true
          install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }}

build.sh

100644100755
Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
#!/bin/bash
22

33
rm -rf build
4-
git clean -fdx -e .idea
5-
git clean -fdX -e .idea
6-
74

85
export USE_ZOOM=1
96
export USE_ROCM=0
@@ -118,13 +115,12 @@ export USE_VULKAN_FP16_INFERENCE=0
118115
# Spelled with the letters "ON"; the previous name ended in the digit 0, so the intended build option was never set.
export USE_VULKAN_RELAXED_PRECISION=0
119116
export USE_XNNPACK=0
120117
export USE_XPU=0
118+
export ONNX_ML=0
121119

122-
# for the ligerllama example we need distributed and tensorpipe, only because
123-
# huggingface model.generate insists on querying torch.distributed and distributed relies on tensorpipe
124-
# this could be a factor of nod-pytorch being out of date with upstream:
125-
# https://github.com/pytorch/pytorch/issues/97397
120+
export PYTORCH_ROCM_ARCH="gfx90a;gfx940;gfx941;gfx942;gfx1100;"
121+
source venv/bin/activate
122+
#python setup.py develop
123+
python setup.py bdist_wheel
126124

127-
python setup.py develop
128-
python zoom_extension/examples/test.py
129-
PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh
130-
python setup.py bdist_wheel
125+
#python zoom_extension/examples/test.py
126+
#PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh

test.sh

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
#
# Runs the Zoom device-type tests from test/test_torch.py and summarizes the
# failures found in the log:
#   test.log                          full test output
#   zoom_unimplemented_operators.log  aten operators missing on the 'zoom'
#                                     backend, most frequent first
#   zoom_test_errors.log              error messages, most frequent first
# Each file from a previous run is first copied to a `.bak` sibling.
# The exit status of the test run itself is not propagated.

log_file="test.log"
bak_file="test.log.bak"
output_file="zoom_unimplemented_operators.log"
bak_out="zoom_unimplemented_operators.log.bak"
error_file="zoom_test_errors.log"
bak_err="zoom_test_errors.log.bak"

# backup logs
[ -f "$log_file" ] && cp "$log_file" "$bak_file"
[ -f "$output_file" ] && cp "$output_file" "$bak_out"
[ -f "$error_file" ] && cp "$error_file" "$bak_err"

python test/test_torch.py --run-parallel 0 -k TestTorchDeviceTypePRIVATEUSEONE --verbose &> "$log_file"
#python test/test_ops.py -k TestCommonPRIVATEUSEONE
#python test/test_ops.py -k TestCommonPRIVATEUSEONE.test_compare_cpu --verbose &> $log_file
#python test/test_ops.py -k TestCommonPRIVATEUSEONE.test_numpy_ref --verbose &> $log_file

## Find Unimplemented Operator Errors from failing tests
# Pattern to search for
pattern="Could not run 'aten::[^']*' with arguments from the 'zoom' backend"

# Extract aten operators, count frequencies, sort by frequency (descending), and save to output file
grep -oP "$pattern" "$log_file" |
    sed -n "s/.*'aten::\([^']*\)'.*/\1/p" |
    sort |
    uniq -c |
    sort -rn |
    sed 's/^ *//; s/ /\t/' > "$output_file"

# Count total matches
total_matches=$(grep -cP "$pattern" "$log_file")

# Append total matches to the output file
echo -e "\nTotal unimplemented operator failures: $total_matches" >> "$output_file"
echo "A list of unimplemented operators has been saved to $output_file"

## Find errors from failing tests
# Extract error messages, count frequencies, sort by frequency (descending), and save to output file
# Pattern to search for
pattern="^.*Error: (?!test)(.+?)(?=\n|$)"

# -E is required for the group below: in sed's default (basic) syntax the
# parentheses are literal characters, so the "<Something>Error: " prefix was
# never stripped from the messages.
grep -oP "$pattern" "$log_file" |
    sed -E 's/^(.*Error): //' |
    awk '{print substr($0, 1, 100)}' |  # Limit to first 100 characters
    sort |
    uniq -c |
    sort -rn |
    sed 's/^ *//; s/ /\t/' > "$error_file"

# Count total matches
total_matches=$(grep -cP "$pattern" "$log_file")

# Append total matches to the output file
echo -e "\nTotal test errors failures: $total_matches" >> "$error_file"
echo "A list of test errors has been saved to $error_file"

echo "Test logs have been saved to $log_file"

zoom_extension/examples/test.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import torch.zoom

# Smoke test for the Zoom backend: register the backend name, generate the
# PrivateUse1 helper methods, then allocate one tensor on device 0 and print it.
torch.utils.rename_privateuse1_backend("zoom")

# TODO: figure this out
unsupported_dtypes = None
torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=unsupported_dtypes)

x = torch.empty(5, dtype=torch.int64, device="zoom:0")
print(x)
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import torch
import torch.zoom
from typing import ClassVar

# Register the backend name and generate its PrivateUse1 helper methods
# before the test class below is defined.
torch.utils.rename_privateuse1_backend('zoom')
unsupported_dtypes = None
torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=unsupported_dtypes)


# NOTE(review): DeviceTypeTestBase is not imported in this file; presumably it
# is injected by whatever loads this module (the scripts pass this path through
# TORCH_TEST_DEVICES) - confirm.
class ZoomTestBase(DeviceTypeTestBase):
    """Device-type test base for the Zoom backend."""

    device_type = 'privateuseone'
    # Set in setUpClass; a 'zoom:<index>' string.
    primary_device: ClassVar[str]

    @classmethod
    def get_primary_device(cls):
        """Return the device string recorded by setUpClass."""
        return cls.primary_device

    @classmethod
    def get_all_devices(cls):
        """Return the primary device followed by every other Zoom device."""
        primary = cls.get_primary_device()
        primary_idx = int(primary.split(':')[1])

        others = []
        for idx in range(torch.zoom.device_count()):
            if idx != primary_idx:
                others.append('zoom:{0}'.format(idx))
        return [primary] + others

    @classmethod
    def setUpClass(cls):
        """Initialize Zoom and record the current device as the primary one."""
        # Force Zoom Init
        torch.ones(1, device='zoom')
        # Acquires the current device as the primary (test) device
        cls.primary_device = 'zoom:{0}'.format(torch.zoom.current_device())


TEST_CLASS = ZoomTestBase

0 commit comments

Comments
 (0)