Skip to content

Commit 8b63191

Browse files
committed
Merge from daisyden/distributed_2.8 branch
Signed-off-by: Cheng, Penghui <[email protected]>
2 parents 2ed7973 + faf4a7f commit 8b63191

26 files changed

+929
-169
lines changed

.github/actions/pt2e/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ runs:
9696
run: |
9797
source activate e2e_ci
9898
source .github/scripts/env.sh ${{ inputs.pytorch }}
99-
pt2e_logs_dir="${{ github.workspace }}/../pytorch/inductor_log"
99+
pt2e_logs_dir="${{ github.workspace }}/../pytorch/inductor_log/pt2e"
100100
rm -rf "${pt2e_logs_dir}" && mkdir -p "${pt2e_logs_dir}"
101101
if [[ "${{ inputs.scenario }}" == *"accuracy"* ]];then
102102
if [[ "${{ inputs.dt }}" == *"float32"* ]];then

.github/workflows/_linux_build.yml

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,6 @@ on:
1313
type: string
1414
default: 'false'
1515
description: Keep torch-xpu-ops pin. `true` means use pined commit
16-
abi:
17-
required: false
18-
type: string
19-
default: '1'
20-
description: ABI version. Default abi as 1.
2116
python:
2217
required: false
2318
type: string
@@ -98,11 +93,6 @@ jobs:
9893
export TORCH_XPU_ARCH_LIST='pvc'
9994
fi
10095
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
101-
if [[ ${{ inputs.abi }} == '0' ]]; then
102-
export _GLIBCXX_USE_CXX11_ABI=0
103-
else
104-
export _GLIBCXX_USE_CXX11_ABI=1
105-
fi
10696
build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
10797
repo="${{ github.repository }}"
10898
last_commit=$(gh --repo $repo issue view $commit_issue --json body -q .body | grep ${{ inputs.pytorch }} | cut -d'[' -f 2 | cut -d']' -f 1)
@@ -181,11 +171,11 @@ jobs:
181171
if: ${{ ! cancelled() }}
182172
uses: actions/upload-artifact@v4
183173
with:
184-
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
174+
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }}
185175
path: ${{ github.workspace }}/torch*.whl
186176
- name: Upload Build Log
187177
if: ${{ ! cancelled() }}
188178
uses: actions/upload-artifact@v4
189179
with:
190-
name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
180+
name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }}
191181
path: ${{ github.workspace }}/pytorch_*.log

.github/workflows/_linux_ut.yml

Lines changed: 140 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,6 @@ on:
2323
type: string
2424
default: ''
2525
description: UT scope. `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu` Delimiter is comma
26-
abi:
27-
required: false
28-
type: string
29-
default: '1'
30-
description: ABI version. Default abi as 1.
3126
python:
3227
required: false
3328
type: string
@@ -49,7 +44,7 @@ permissions: read-all
4944
jobs:
5045
ut_test:
5146
runs-on: ${{ inputs.runner }}
52-
if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }}
47+
if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }}
5348
timeout-minutes: 900
5449
env:
5550
NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
@@ -100,18 +95,13 @@ jobs:
10095
if: ${{ inputs.pytorch != 'nightly_wheel' }}
10196
uses: actions/download-artifact@v4
10297
with:
103-
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
98+
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }}
10499
path: ${{ github.workspace }}
105100
- name: Install Pytorch XPU
106101
run: |
107102
source activate xpu_op_${ZE_AFFINITY_MASK}
108103
source .github/scripts/env.sh ${{ inputs.pytorch }}
109104
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
110-
if [[ ${{ inputs.abi }} == '0' ]]; then
111-
export _GLIBCXX_USE_CXX11_ABI=0
112-
else
113-
export _GLIBCXX_USE_CXX11_ABI=1
114-
fi
115105
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
116106
cd ../pytorch
117107
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
@@ -293,7 +283,7 @@ jobs:
293283
if: always()
294284
uses: actions/upload-artifact@v4
295285
with:
296-
name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-${{ env.UT_NAME }}
286+
name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }}
297287
path: ${{ github.workspace }}/ut_log
298288

299289
distributed_ut_test:
@@ -349,18 +339,13 @@ jobs:
349339
if: ${{ inputs.pytorch != 'nightly_wheel' }}
350340
uses: actions/download-artifact@v4
351341
with:
352-
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
342+
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }}
353343
path: ${{ github.workspace }}
354344
- name: Install Pytorch XPU
355345
run: |
356346
source activate xpu_op_${ZE_AFFINITY_MASK}
357347
source .github/scripts/env.sh ${{ inputs.pytorch }}
358348
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
359-
if [[ ${{ inputs.abi }} == '0' ]]; then
360-
export _GLIBCXX_USE_CXX11_ABI=0
361-
else
362-
export _GLIBCXX_USE_CXX11_ABI=1
363-
fi
364349
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
365350
cd ../pytorch
366351
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
@@ -433,7 +418,142 @@ jobs:
433418
if: ${{ ! cancelled() }}
434419
uses: actions/upload-artifact@v4
435420
with:
436-
name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-xpu_distributed
421+
name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed
422+
path: ${{ github.workspace }}/ut_log
423+
424+
pytorch_distributed_test:
425+
runs-on: ${{ inputs.runner }}
426+
if: contains(inputs.ut, 'pytorch_distributed')
427+
timeout-minutes: 900
428+
env:
429+
NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
430+
DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
431+
steps:
432+
- name: Checkout torch-xpu-ops
433+
uses: actions/checkout@v4
434+
- name: Prepare Stock Pytorch
435+
run: |
436+
pwd
437+
which conda && conda clean -ay
438+
conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
439+
rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
440+
conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
441+
source activate xpu_op_${ZE_AFFINITY_MASK}
442+
cd ../ && rm -rf pytorch
443+
pip install requests
444+
git clone https://github.com/daisyden/pytorch.git pytorch
445+
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
446+
cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
447+
# apply PRs for stock pytorch
448+
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
449+
git status && git show -s
450+
git submodule sync && git submodule update --init --recursive
451+
if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
452+
echo "Don't replace torch-xpu-ops!"
453+
else
454+
rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
455+
# Workaround for torch-xpu-ops ci test
456+
sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
457+
fi
458+
fi
459+
- name: Triton Installation
460+
run: |
461+
source activate xpu_op_${ZE_AFFINITY_MASK}
462+
cd ../pytorch
463+
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
464+
if [ -z ${{ inputs.triton }} ]; then
465+
TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
466+
else
467+
TRITON_COMMIT_ID="${{ inputs.triton }}"
468+
fi
469+
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
470+
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
471+
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
472+
fi
473+
- name: Download Pytorch wheel
474+
if: ${{ inputs.pytorch != 'nightly_wheel' }}
475+
uses: actions/download-artifact@v4
476+
with:
477+
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }}
478+
path: ${{ github.workspace }}
479+
- name: Install Pytorch XPU
480+
run: |
481+
source activate xpu_op_${ZE_AFFINITY_MASK}
482+
source .github/scripts/env.sh ${{ inputs.pytorch }}
483+
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
484+
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
485+
cd ../pytorch
486+
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
487+
pip install -r requirements.txt
488+
pip install --force-reinstall ${{ github.workspace }}/torch*.whl
489+
git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
490+
else
491+
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
492+
TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
493+
cd ../pytorch
494+
git reset --hard && git checkout ${TORCH_COMMIT_ID}
495+
TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
496+
rm -rf third_party/torch-xpu-ops
497+
git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
498+
cd third_party/torch-xpu-ops
499+
git checkout ${TORCH_XPU_OPS_COMMIT}
500+
cd ../..
501+
python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
502+
fi
503+
pip install -r .ci/docker/requirements-ci.txt
504+
- name: Torch Config
505+
run: |
506+
source activate xpu_op_${ZE_AFFINITY_MASK}
507+
source .github/scripts/env.sh ${{ inputs.pytorch }}
508+
python -c "import torch; print(torch.__config__.show())"
509+
python -c "import torch; print(torch.__config__.parallel_info())"
510+
python -c "import torch; print(torch.xpu.device_count())"
511+
python -c "import triton; print(triton.__version__)"
512+
513+
cd ..
514+
python pytorch/torch/utils/collect_env.py
515+
rm -rf /tmp/torchinductor_*
516+
rm -rf ~/.triton/cache
517+
- name: Run Torch XPU Distributed UT
518+
run: |
519+
source .github/scripts/env.sh ${{ inputs.pytorch }}
520+
source activate xpu_op_${ZE_AFFINITY_MASK}
521+
pip install pytest
522+
cd ${{ github.workspace }}
523+
sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk
524+
sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope
525+
mkdir -p ut_log/pytorch_distributed
526+
cd ../pytorch/third_party/torch-xpu-ops/test/xpu
527+
XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())")
528+
if [[ "${XCCL_ENABLE}}" == 'False' ]]; then
529+
echo -e "[ERROR] XCCL is not enabled"
530+
exit 1
531+
fi
532+
python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log
533+
cd ${{ github.workspace }}
534+
sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
535+
- name: UT Test Results Check
536+
shell: bash
537+
run: |
538+
function contains() {
539+
contains_status="echo 'Start $2 ...'"
540+
{
541+
[[ $1 =~ (^|,)$2($|,) ]]
542+
} || {
543+
echo "[Warning] $2 is not supported type! Skipped!"
544+
contains_status="continue"
545+
}
546+
}
547+
set -xe
548+
echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
549+
cd ${{ github.workspace }}/ut_log/pytorch_distributed
550+
cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
551+
bash ut_result_check.sh 'pytorch_distributed'
552+
- name: Upload Inductor XPU UT Log
553+
if: ${{ ! cancelled() }}
554+
uses: actions/upload-artifact@v4
555+
with:
556+
name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed
437557
path: ${{ github.workspace }}/ut_log
438558

439559
pytorch_distributed_test:

.github/workflows/nightly_ondemand.yml

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ jobs:
7676
with:
7777
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
7878
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
79-
abi: 1
8079
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
8180
runner: pvc_e2e
8281
update_lkg: 'true'
@@ -186,7 +185,7 @@ jobs:
186185
if: ${{ inputs.pytorch != 'nightly_wheel' }}
187186
uses: actions/download-artifact@v4
188187
with:
189-
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-1
188+
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
190189
path: ${{ github.workspace }}
191190
- name: Install Pytorch XPU
192191
run: |
@@ -326,7 +325,7 @@ jobs:
326325
source activate e2e_ci
327326
cp -r ${{ github.workspace }}/.github/scripts/summary_pt2e.py ${{ github.workspace }}/upload_files
328327
cd ${{ github.workspace }}/upload_files
329-
python summary_pt2e.py ${{ github.workspace }}/upload_files
328+
python summary_pt2e.py ${{ github.workspace }}/upload_files/inductor_log/pt2e
330329
rm -rf summary_pt2e.py
331330
fi
332331
- name: Upload Inductor XPU E2E Data
@@ -336,34 +335,6 @@ jobs:
336335
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
337336
path: ${{ github.workspace }}/upload_files
338337

339-
Linux-Nightly-Ondemand-Build-ABI-0:
340-
secrets: inherit
341-
if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
342-
name: linux-nightly-ondemand-abi0
343-
permissions:
344-
issues: write
345-
uses: ./.github/workflows/_linux_build.yml
346-
with:
347-
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
348-
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
349-
abi: 0
350-
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
351-
runner: pvc_e2e
352-
353-
Linux-Weekly-UT-Tests-ABI-0:
354-
if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
355-
name: linux-nightly-ondemand-abi0
356-
needs: Linux-Nightly-Ondemand-Build-ABI-0
357-
uses: ./.github/workflows/_linux_ut.yml
358-
with:
359-
abi: 0
360-
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
361-
ut: op_regression,op_regression_dev1,op_extended,op_ut
362-
pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-ABI-0.outputs.torch_commit_id }}
363-
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
364-
triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
365-
runner: linux.idc.xpu
366-
367338
Tests-Failure-And-Report:
368339
if: ${{ ! cancelled() }}
369340
runs-on: [ self-hosted, Linux ]

.github/workflows/nightly_ondemand_rolling.yml

Lines changed: 2 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ jobs:
7777
with:
7878
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
7979
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
80-
abi: 1
8180
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
8281
driver: rolling
8382
runner: pvc_rolling
@@ -191,7 +190,7 @@ jobs:
191190
if: ${{ inputs.pytorch != 'nightly_wheel' }}
192191
uses: actions/download-artifact@v4
193192
with:
194-
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-1
193+
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
195194
path: ${{ github.workspace }}
196195
- name: Install Pytorch XPU
197196
run: |
@@ -341,7 +340,7 @@ jobs:
341340
source activate e2e_ci
342341
cp -r ${{ github.workspace }}/.github/scripts/summary_pt2e.py ${{ github.workspace }}/upload_files
343342
cd ${{ github.workspace }}/upload_files
344-
python summary_pt2e.py ${{ github.workspace }}/upload_files
343+
python summary_pt2e.py ${{ github.workspace }}/upload_files/inductor_log/pt2e
345344
rm -rf summary_pt2e.py
346345
fi
347346
- name: Upload Inductor XPU E2E Data
@@ -350,37 +349,7 @@ jobs:
350349
with:
351350
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
352351
path: ${{ github.workspace }}/upload_files
353-
354-
Linux-Nightly-Ondemand-Build-Rolling-ABI-0:
355-
secrets: inherit
356-
if: github.event_name == 'schedule' && github.event.schedule == '30 16 * * 5'
357-
name: linux-nightly-ondemand-rolling-abi0
358-
permissions:
359-
issues: write
360-
uses: ./.github/workflows/_linux_build.yml
361-
with:
362-
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
363-
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
364-
abi: 0
365-
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
366-
driver: rolling
367-
runner: pvc_rolling
368352

369-
Linux-Weekly-UT-Tests-Rolling-ABI-0:
370-
if: github.event_name == 'schedule' && github.event.schedule == '30 16 * * 5'
371-
name: linux-nightly-ondemand-rolling-abi0
372-
needs: Linux-Nightly-Ondemand-Build-Rolling-ABI-0
373-
uses: ./.github/workflows/_linux_ut.yml
374-
with:
375-
abi: 0
376-
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
377-
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut' || inputs.ut }}
378-
pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling-ABI-0.outputs.torch_commit_id }}
379-
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
380-
triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
381-
driver: rolling
382-
runner: pvc_rolling
383-
384353
Tests-Failure-And-Report:
385354
if: ${{ ! cancelled() }}
386355
runs-on: [ self-hosted, Linux ]

.github/workflows/nightly_ondemand_whl.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ jobs:
287287
source activate e2e_ci
288288
cp -r ${{ github.workspace }}/.github/scripts/summary_pt2e.py ${{ github.workspace }}/upload_files
289289
cd ${{ github.workspace }}/upload_files
290-
python summary_pt2e.py ${{ github.workspace }}/upload_files
290+
python summary_pt2e.py ${{ github.workspace }}/upload_files/inductor_log/pt2e
291291
rm -rf summary_pt2e.py
292292
fi
293293
- name: Upload Inductor XPU E2E Data

0 commit comments

Comments
 (0)