diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py
index 8cd490bc8..7e7c6ecd4 100644
--- a/.github/scripts/check-ut.py
+++ b/.github/scripts/check-ut.py
@@ -1,22 +1,47 @@
 import argparse
 import sys
 import os
+import re
 from junitparser import JUnitXml, Error, Failure, Skipped
 
-parser = argparse.ArgumentParser()
-parser.add_argument('junitxml', nargs='+')
+parser = argparse.ArgumentParser(description='Test results analyzer')
+parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files')
 args = parser.parse_args()
 
 failures = []
-suites = []
+summaries = []
+
+error_types = [
+    "RuntimeError",
+    "ValueError",
+    "TypeError",
+    "AttributeError",
+    "KeyError",
+    "IndexError",
+    "ImportError",
+    "AssertionError",
+    "Exception",
+    "OSError",
+    "Failed",
+    "TimeoutError",
+    "asyncio.TimeoutError",
+    "FileNotFoundError",
+    "PermissionError",
+    "NotImplementedError",
+]
 
 def get_classname(case):
-    return ' '.join(case.classname.split())
+    return ' '.join(case.classname.split()) if hasattr(case, 'classname') else case.get('classname', '')
 
 def get_name(case):
+    if isinstance(case, dict):
+        return case.get('name', '')
     return ' '.join(case.name.split())
 
 def get_result(case):
+    if isinstance(case, dict):
+        return case.get('status', 'failed')
+
     result = "passed"
     if case.result:
         if isinstance(case.result[0], Error):
@@ -28,88 +53,210 @@ def get_result(case):
     return result
 
 def get_message(case):
+    if isinstance(case, dict):
+        return case.get('error', '')
+
     if not case.result:
         return ""
-    return f"{case.result[0].message.splitlines()[0]}"
 
-def print_md_row(row, print_header):
+    full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message
+    if not full_text:
+        return ""
+
+    error_messages = []
+
+    for line in full_text.splitlines():
+        stripped_line = line.strip()
+        if not stripped_line:
+            continue
+
+        for error_type in error_types:
+            if stripped_line.startswith(error_type + ": "):
+                error_msg = stripped_line[len(error_type) + 2:]
+                error_messages.append(f"{error_type}: {error_msg}")
+                break
+            elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line:
+                error_msg = stripped_line.split(f"{error_type}:")[-1].strip()
+                error_messages.append(f"{error_type}: {error_msg}")
+                break
+
+    return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}"
+
+
+def print_md_row(row, print_header=False):
     if print_header:
-        header = " | ".join([f"{key}" for key, _ in row.items()])
+        header = " | ".join([f"{key}" for key in row.keys()])
         print(f"| {header} |")
-        header = " | ".join(["-"*len(key) for key, _ in row.items()])
+        header = " | ".join(["---"] * len(row))
         print(f"| {header} |")
-    row = " | ".join([f"{value}" for _, value in row.items()])
-    print(f"| {row} |")
+    row_values = " | ".join([f"{value}" for value in row.values()])
+    print(f"| {row_values} |")
 
-def print_cases(cases):
+def print_failures():
+    if not failures:
+        return
+
+    print("### Test Failures")
     print_header = True
-    for case in cases:
-        classname = get_classname(case)
-        name = get_name(case)
-        result = get_result(case)
-        message = get_message(case)
-        row = {
-            'Class name': classname,
-            'Test name': name,
-            'Status': result,
-            'Message': message,
-        }
-        print_md_row(row, print_header)
+    for case in failures:
+        print_md_row({
+            'Class name': get_classname(case),
+            'Test name': get_name(case),
+            'Status': get_result(case),
+            'Message': get_message(case),
+            'Source': case['source'] if isinstance(case, dict) else 'XML'
+        }, print_header)
         print_header = False
 
-def print_suite(suite):
+def parse_log_file(log_file):
+    with open(log_file, encoding='utf-8') as f:
+        content = f.read()
+
+    ut_name = os.path.splitext(os.path.basename(log_file))[0]
+    summary = {
+        'Category': determine_category(ut_name),
+        'UT': ut_name,
+        'Test cases': 0,
+        'Passed': 0,
+        'Skipped': 0,
+        'Failures': 0,
+        'Errors': 0,
+        'Source': 'Log'
+    }
+
+    # Extract test counts
+    test_run_match = re.search(r"Ran (\d+) tests in [\d.]+s", content)
+    if test_run_match:
+        summary['Test cases'] = int(test_run_match.group(1))
+
+    # Extract skipped case number
+    skipped_match = re.search(r"skipped[ =](\d+)", content, re.IGNORECASE)
+    if skipped_match:
+        summary['Skipped'] = int(skipped_match.group(1))
+    else:
+        skipped_match = re.search(r"skipped (\d+) cases?", content, re.IGNORECASE)
+        if skipped_match:
+            summary['Skipped'] = int(skipped_match.group(1))
+
+    # Extract failures
+    failure_blocks = re.findall(r"(FAIL:.*?)(?:\n\n|\n=+\n|\Z)", content, re.DOTALL)
+    exist_test_names = set()
+    failures_number = 0
+
+    for block in failure_blocks:
+        case_match = re.match(r"FAIL: (\w+) \(__mp_main__\.(\w+)\)", block)
+        if not case_match:
+            continue
+
+        test_name = case_match.group(1)
+        if test_name in exist_test_names:
+            continue
+        exist_test_names.add(test_name)
+
+        error_msg = []
+        error_pattern = r"(" + "|".join(error_types) + r"):.*?(?=\n\S|\n\n|\n=+\n|\Z)"
+        # re.finditer returns a lazy iterator that is always truthy; materialize
+        # it as a list so the emptiness check below actually works
+        error_matches = list(re.finditer(error_pattern, block, re.DOTALL))
+        if not error_matches and "Traceback" in block:
+            error_msg.append("Unknown error (see traceback)")
+        else:
+            for match in error_matches:
+                error_msg.append(match.group(0).strip())
+
+        failures.append({
+            'classname': ut_name,
+            'name': f"{case_match.group(2)}:{test_name}",
+            'error': " ".join(error_msg),
+            'status': 'failed',
+            'source': 'Log'
+        })
+        failures_number += 1
+
+    if failures_number > summary['Failures']:
+        summary['Failures'] = failures_number
+        summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped']
+
+    return summary
+
+def determine_category(ut):
+    if ut == 'op_regression':
+        return 'op_regression'
+    elif ut == 'op_regression_dev1':
+        return 'op_regression_dev1'
+    elif ut == 'op_extended':
+        return 'op_extended'
+    elif 'op_ut' in ut:
+        return 'op_ut'
+    else:
+        return 'unknown'
+
+def process_log_file(log_file):
+    try:
+        summary = parse_log_file(log_file)
+        summaries.append(summary)
+    except Exception as e:
+        print(f"Error processing {log_file}: {e}", file=sys.stderr)
+
+def process_xml_file(xml_file):
+    try:
+        xml = JUnitXml.fromfile(xml_file)
+        ut = os.path.basename(xml_file).split('.')[0]
+        category = determine_category(ut)
+
+        for suite in xml:
+            suite_summary = {
+                'Category': category,
+                'UT': ut,
+                'Test cases': suite.tests,
+                'Passed': suite.tests - suite.skipped - suite.failures - suite.errors,
+                'Skipped': suite.skipped,
+                'Failures': suite.failures,
+                'Errors': suite.errors,
+                'Source': 'XML'
+            }
+            summaries.append(suite_summary)
+
+            for case in suite:
+                if get_result(case) not in ["passed", "skipped"]:
+                    failures.append(case)
+    except Exception as e:
+        print(f"Error processing {xml_file}: {e}", file=sys.stderr)
+
+def print_summary():
+    print("### Results Summary")
     print_header = True
-    for suite in suites:
-        ut = args.junitxml[0]
-        del(args.junitxml[0])
-        ut = os.path.basename(ut).split('.')[0]
-        tests = suite.tests
-        skipped = suite.skipped
-        failures = suite.failures
-        errors = suite.errors
-        if ut == 'op_regression':
-            category = 'op_regression'
-        elif ut == 'op_regression_dev1':
-            category = 'op_regression_dev1'
-        elif ut == 'op_extended':
-            category = 'op_extended'
-        elif 'op_ut' in ut:
-            category = 'op_ut'
-        row = {
-            'Category': category,
-            'UT': ut,
-            'Test cases': tests,
-            'Passed': tests-skipped-failures-errors,
-            'Skipped': skipped,
-            'Failures': failures,
-            'Errors': errors,
-        }
-        print_md_row(row, print_header)
+
+    for summary in summaries:
+        print_md_row({
+            'Category': summary['Category'],
+            'UT': summary['UT'],
+            'Test cases': summary['Test cases'],
+            'Passed': summary['Passed'],
+            'Skipped': summary['Skipped'],
+            'Failures': summary['Failures'],
+            'Errors': summary['Errors'],
+            'Source': summary['Source']
+        }, print_header)
+
         print_header = False
 
-xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ]
-for idx, xml in enumerate(xmls):
-    for suite in xml:
-        suites.append(suite)
-        for case in suite:
-            classname = get_classname(case)
-            name = get_name(case)
-            result = get_result(case)
-            if result not in ["passed", "skipped"]:
-                failures.append(case)
-
-printed = False
-def print_break(needed):
-    if needed:
-        print("")
-
-if failures:
-    print_break(printed)
-    print("### Failures")
-    print_cases(failures)
-    printed = True
-
-print("### Results Summary")
-print_suite(suites)
-
-sys.exit(0)
+def main():
+    for input_file in args.input_files:
+        if input_file.endswith('.log'):
+            process_log_file(input_file)
+        elif input_file.endswith('.xml'):
+            process_xml_file(input_file)
+        else:
+            print(f"Skipping unknown file type: {input_file}", file=sys.stderr)
+
+    print_failures()
+    print_summary()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh
index 3fb1a1997..9bf611786 100644
--- a/.github/scripts/ut_result_check.sh
+++ b/.github/scripts/ut_result_check.sh
@@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
     echo -e "[PASS] UT ${ut_suite} test Pass"
   fi
 fi
-if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
-  grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
-  num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log")
+if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then
+  grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log
+  num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log")
   echo -e "========================================================================="
-  echo -e "Show Failed cases in ${ut_suite} xpu distributed"
+  echo -e "Show Failed cases in ${ut_suite}"
   echo -e "========================================================================="
-  cat "./${ut_suite}_xpu_distributed_test_failed.log"
-  ((num_failed=num_failed_xpu_distributed))
+  cat "./${ut_suite}_test_failed.log"
+  ((num_failed=num_failed_distributed))
   if [[ $num_failed -gt 0 ]]; then
     echo -e "[ERROR] UT ${ut_suite} test Fail"
     exit 1
diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml
index f7381a502..282197c06 100644
--- a/.github/workflows/_linux_build.yml
+++ b/.github/workflows/_linux_build.yml
@@ -163,13 +163,13 @@ jobs:
       if: ${{ ! cancelled() }}
       uses: actions/upload-artifact@v4
       with:
-        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
+        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }}
         path: ${{ github.workspace }}/torch*.whl
     - name: Upload Build Log
       if: ${{ ! cancelled() }}
       uses: actions/upload-artifact@v4
       with:
-        name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}
+        name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }}
         path: ${{ github.workspace }}/pytorch_*.log
     - name: Cleanup
       if: always()
diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index f0f8ea42f..0fbb85b7e 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -44,7 +44,7 @@ permissions: read-all
 jobs:
   ut_test:
     runs-on: ${{ inputs.runner }}
-    if: ${{ inputs.ut != 'xpu_distributed' }}
+    if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }}
     timeout-minutes: 900
     env:
       NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
@@ -60,7 +60,7 @@ jobs:
         rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
         conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
         source activate xpu_op_${ZE_AFFINITY_MASK}
-        cd ../ && rm -rf pytorch
+        cd ../ && sudo rm -rf pytorch
        pip install requests
        git clone https://github.com/pytorch/pytorch pytorch
        if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
@@ -95,7 +95,7 @@ jobs:
       if: ${{ inputs.pytorch != 'nightly_wheel' }}
       uses: actions/download-artifact@v4
       with:
-        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
+        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }}
         path: ${{ github.workspace }}
     - name: Install Pytorch XPU
       run: |
@@ -175,6 +175,18 @@ jobs:
           cd ../pytorch/third_party/torch-xpu-ops/test/xpu
           timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log
           cp *.xml ${{ github.workspace }}/ut_log
+          find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c '
+            dir_path=$(dirname "$1");
+            case "$dir_path" in
+              *"op_ut_with_skip_quantization/core"*)
+                dir_name="op_ut_with_skip_quantization_core";;
+              *)
+                dir_name=$(basename "$dir_path");;
+            esac;
+            mv "$1" "$dir_path/${dir_name}_$(basename "$1")"
+          ' _ {} \;
+          cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log
+          cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log
           # Cases run with a on-demand white list, since some suites are too
           # slow to go through all operators on CPU. So add cases on-demand
           # when XPU implementatoin is done.
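(Aside, outside the patch: the `find ... -exec sh -c` step above prefixes each result file with its suite directory so that reports from `op_ut_with_skip_nn` and `op_ut_with_skip_quantization/core` cannot collide once everything is flattened into `ut_log`. A rough Python equivalent, as an illustrative sketch only — the directory names come from the workflow step, everything else is assumed:

from pathlib import Path

def prefix_results_with_suite(roots=("op_ut_with_skip_nn", "op_ut_with_skip_quantization/core")):
    # Rename every file under each root to "<suite>_<file>", mirroring the shell step.
    for root in roots:
        for f in Path(root).rglob("*"):
            if not f.is_file():
                continue
            # quantization/core collapses to a single flat prefix, like the case arm above
            if "op_ut_with_skip_quantization/core" in f.parent.as_posix():
                prefix = "op_ut_with_skip_quantization_core"
            else:
                prefix = f.parent.name
            f.rename(f.with_name(f"{prefix}_{f.name}"))
)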
@@ -292,7 +304,7 @@ jobs:
         rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
         conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
         source activate xpu_op_${ZE_AFFINITY_MASK}
-        cd ../ && rm -rf pytorch
+        cd ../ && sudo rm -rf pytorch
         pip install requests
         git clone https://github.com/pytorch/pytorch pytorch
         if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
@@ -327,7 +339,7 @@ jobs:
       if: ${{ inputs.pytorch != 'nightly_wheel' }}
       uses: actions/download-artifact@v4
       with:
-        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
+        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }}
         path: ${{ github.workspace }}
     - name: Install Pytorch XPU
       run: |
@@ -403,3 +415,145 @@ jobs:
       with:
         name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed
         path: ${{ github.workspace }}/ut_log
+
+  pytorch_distributed_test:
+    runs-on: ${{ inputs.runner }}
+    if: contains(inputs.ut, 'pytorch_distributed')
+    timeout-minutes: 900
+    env:
+      NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
+      DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
+    steps:
+    - name: Checkout torch-xpu-ops
+      uses: actions/checkout@v4
+    - name: Prepare Stock Pytorch
+      run: |
+        pwd
+        which conda && conda clean -ay
+        conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
+              rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
+        conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
+        source activate xpu_op_${ZE_AFFINITY_MASK}
+        cd ../ && rm -rf pytorch
+        pip install requests
+        git clone https://github.com/daisyden/pytorch.git pytorch
+        if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
+          cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
+          # apply PRs for stock pytorch
+          python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
+          git status && git show -s
+          git submodule sync && git submodule update --init --recursive
+          if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
+            echo "Don't replace torch-xpu-ops!"
+          else
+            rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
+            # Workaround for torch-xpu-ops ci test
+            sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
+          fi
+        fi
+    - name: Triton Installation
+      run: |
+        source activate xpu_op_${ZE_AFFINITY_MASK}
+        cd ../pytorch
+        TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
+        if [ -z ${{ inputs.triton }} ]; then
+          TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
+        else
+          TRITON_COMMIT_ID="${{ inputs.triton }}"
+        fi
+        echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
+        if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
+          pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
+        fi
+    - name: Download Pytorch wheel
+      if: ${{ inputs.pytorch != 'nightly_wheel' }}
+      uses: actions/download-artifact@v4
+      with:
+        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }}
+        path: ${{ github.workspace }}
+    - name: Install Pytorch XPU
+      run: |
+        source activate xpu_op_${ZE_AFFINITY_MASK}
+        source .github/scripts/env.sh ${{ inputs.pytorch }}
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
+        if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
+          cd ../pytorch
+          export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+          pip install -r requirements.txt
+          pip install --force-reinstall ${{ github.workspace }}/torch*.whl
+          git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
+        else
+          pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
+          TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
+          cd ../pytorch
+          git reset --hard && git checkout ${TORCH_COMMIT_ID}
+          TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
+          rm -rf third_party/torch-xpu-ops
+          git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
+          cd third_party/torch-xpu-ops && git checkout ${TORCH_XPU_OPS_COMMIT}
+        fi
+    - name: Run Torch XPU Distributed UT
+      run: |
+        source .github/scripts/env.sh ${{ inputs.pytorch }}
+        source activate xpu_op_${ZE_AFFINITY_MASK}
+        sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk
+        sudo echo "0" | sudo tee /proc/sys/kernel/yama/ptrace_scope
+        mkdir -p ${{ github.workspace }}/ut_log/pytorch_distributed ${{ github.workspace }}/ut_log/pytorch_distributed_summary
+        cd ../pytorch/third_party/torch-xpu-ops/test/xpu
+        timeout 10000 python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log
+        cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary
+        cd ${{ github.workspace }}
+        sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
+    - name: Distributed UT Test Results Summary
+      run: |
+        source activate xpu_op_${ZE_AFFINITY_MASK}
+        pip install junitparser
+        python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true
+    - name: UT Test Results Check
+      shell: bash
+      run: |
+        function contains() {
+          contains_status="echo 'Start $2 ...'"
+          {
+            [[ $1 =~ (^|,)$2($|,) ]]
+          } || {
+            echo "[Warning] $2 is not supported type! Skipped!"
+            contains_status="continue"
+          }
+        }
+        set -xe
+        echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
+        cd ${{ github.workspace }}/ut_log/pytorch_distributed
+        cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
+        bash ut_result_check.sh 'pytorch_distributed'
+    - name: Upload Inductor XPU UT Log
+      if: ${{ ! cancelled() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed
+        path: ${{ github.workspace }}/ut_log
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 7592e24b6..0b62c4c2d 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -66,6 +66,31 @@ jobs:
       pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }}
       ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed
       runner: linux.idc.xpu
+
+  preci-linux-build-distributed:
+    # Don't run on forked repos and draft PRs
+    secrets: inherit
+    if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
+    name: preci-linux-distributed
+    needs: preci-lint-check
+    permissions:
+      issues: write
+    uses: ./.github/workflows/_linux_build.yml
+    with:
+      pytorch: distributed_2.8
+      runner: pvc_e2e
+
+  preci-ut-distributed:
+    # Don't run on forked repos and draft PRs
+    secrets: inherit
+    if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
+    name: preci-linux-distributed
+    needs: preci-linux-build-distributed
+    uses: ./.github/workflows/_linux_ut.yml
+    with:
+      pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }}
+      ut: pytorch_distributed
+      runner: pvc_e2e
 
   Inductor-XPU-E2E-CI-Tests:
     name: preci-linux / e2e_test
@@ -110,7 +135,7 @@ jobs:
     - name: Download Pytorch wheel
       uses: actions/download-artifact@v4
       with:
-        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
+        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }}
         path: ${{ github.workspace }}
     - name: Install Pytorch XPU
       run: |
diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py
new file mode 100644
index 000000000..94fbfae32
--- /dev/null
+++ b/test/xpu/run_distributed_local.py
@@ -0,0 +1,135 @@
+import os
+import subprocess
+import sys
+
+from skip_list_dist_local import skip_dict, skip_dict_python
+
+res = 0
+res2 = 0
+fail_test = []
+error_log = ""
+
+# Append the pipelining test helpers to PYTHONPATH for the spawned tests
+# (os.environ does not expand "$PYTHONPATH", so build the value explicitly)
+os.environ["PYTHONPATH"] = os.environ.get("PYTHONPATH", "") + ":../../../../test/distributed/pipelining"
+# Get the xelink group card affinity
+ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
+if ret == 0:
+    gpu_dict = {}
+    with open("topology.log") as file:
+        lines = file.readlines()
+        for line in lines:
+            if "CPU Affinity" in line:
+                continue
+            line = line.strip()
+            if line.startswith("GPU "):
+                items = line.split(" ")
+                items = [x for x in items if x]
+                gpu_id = items[1]
+                i = gpu_id.split("/")[0]
+                affinity = ""
+                for j, item in enumerate(items):
+                    if "SYS" not in item and ("XL" in item or "S" in item):
+                        if len(affinity) == 0:
+                            affinity = str(j - 2)
+                        else:
+                            affinity = affinity + "," + str(j - 2)
+                gpu_dict[i] = affinity
+
+    max_affinity = ""
+    for key, value in gpu_dict.items():
+        if len(value) > len(max_affinity):
+            max_affinity = value
+
+    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
+    print("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))
+
+else:
+    print("xpu-smi topology failed")
+    sys.exit(255)
+
+
+from xpu_test_utils import launch_test
+
+
+# run python test
+def run(test_command):
+    result = subprocess.run(test_command, capture_output=True, text=True)
+    print(result.stdout)
+    print(result.stderr)
+    return result
+
+
+for key in skip_dict_python:
+    skip_list = skip_dict_python[key] if skip_dict_python[key] else []
+    error_log = ""  # reset per test file so each log only carries its own failures
+    test_command = ["python", key]
+    fail = run(test_command)
+    num_skipped = 0
+    num_err = 0
+    if fail.returncode:
+        for i, err in enumerate(fail.stderr.split("FAIL: ")):
+            if i == 0 and len(err) > 0:
+                error_log += err
+                continue
+            is_skipped = False
+            for skip_case in skip_list:
+                if skip_case in err:
+                    print("Skipped error: ", key + " " + skip_case)
+                    num_skipped += 1
+                    is_skipped = True
+                    break
+            if not is_skipped:
+                num_err += 1
+                res2 += fail.returncode
+                if i == len(fail.stderr.split("FAIL: ")) - 1:
+                    error_log += "FAIL: "
+                    for line in err.split("\n"):
+                        if line.startswith("FAILED (failures="):
+                            num_errs = line.split("=")[1].split(")")[0].strip()
+                            error_log += (
+                                "FAILED (failures="
+                                + str(int(num_errs) - num_skipped)
+                                + f" skipped {num_skipped} cases"
+                                + ")\n"
+                            )
+                        else:
+                            error_log += line + "\n"
+                else:
+                    error_log += "FAIL: " + err
+            else:
+                if i == len(fail.stderr.split("FAIL: ")) - 1:
+                    error_log += "FAIL: "
+                    for line in err.split("\n"):
+                        if line.startswith("FAILED (failures="):
+                            num_errs = line.split("=")[1].split(")")[0].strip()
+                            error_log += (
+                                "FAILED (failures="
+                                + str(int(num_errs) - num_skipped)
+                                + f" skipped {num_skipped} cases"
+                                + ")\n"
+                            )
+
+    renamed_key = key.replace("../../../../", "").replace("/", "_")
+    if num_err > 0:
+        fail_test.append(key)
+        with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f:
+            f.write(error_log)
+    else:
+        with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f:
+            f.write(fail.stdout)
+            f.write(fail.stderr)
+
+# run pytest with skiplist
+for key in skip_dict:
+    skip_list = skip_dict[key]
+    fail = launch_test(key, skip_list)
+    res += fail
+    if fail:
+        fail_test.append(key)
+
+if fail_test:
+    print(",".join(fail_test) + " have failures")
+
+exit_code = os.WEXITSTATUS(res)
+if exit_code == 0:
+    sys.exit(res2)
+else:
+    sys.exit(exit_code)
diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py
new file mode 100644
index 000000000..0254e69a3
--- /dev/null
+++ b/test/xpu/skip_list_dist_local.py
@@ -0,0 +1,578 @@
+skip_dict = {
+    "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None,
+    # https://github.com/intel/torch-xpu-ops/issues/1536
+    # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": (
+    #     "test_distributed_checkpoint_state_dict_type0_xpu",
+    #     "test_distributed_checkpoint_state_dict_type1_xpu",
+    # ),
+    "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1504
+        # "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
+        "test_checkpoint_submodule_use_reentrant_False_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1504
+        "test_ddp_parity_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1504
+        "test_bf16_hook_has_wrapping_False_sharding_strategy0",
"test_bf16_hook_has_wrapping_False_sharding_strategy1", + "test_bf16_hook_has_wrapping_False_sharding_strategy2", + "test_bf16_hook_has_wrapping_True_sharding_strategy0", + "test_bf16_hook_has_wrapping_True_sharding_strategy1", + "test_bf16_hook_has_wrapping_True_sharding_strategy2", + "test_fp16_hook_has_wrapping_False_sharding_strategy1", + "test_fp16_hook_has_wrapping_False_sharding_strategy2", + "test_fp16_hook_has_wrapping_True_sharding_strategy0", + "test_fp16_hook_has_wrapping_True_sharding_strategy1", + "test_fp16_hook_has_wrapping_True_sharding_strategy2", + ), + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + # ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_hooks_multi_traversal_xpu", + "test_parity_with_ddp_xpu", + "test_parity_with_non_frozen_fsdp_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + ), + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1535 + # "test_fsdp_zero2_eval_with_prefetch", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_optimizer_overlap", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + # ( + # "test_buffer_dtype_no_root_handle", + # ), + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_multi_forward_cpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_flatten_sharded_optim_state_dict_nested", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + 
"test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_names", + "test_scatter_full_optim_state_dict_nested_halve_world_size", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_halve_world_size", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + 
"test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_use_orig_params", + ), + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_forward_overlap", + # "test_forward_overlap_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", + ), + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_state_dict_save_load_flow_state_dict_type_local_state_dict", + "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", + "test_state_dict_save_load_flow_state_dict_type_state_dict", + ), + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_diff_hyperparams_sharding_strategy_str_full_shard", + "test_diff_hyperparams_sharding_strategy_str_no_shard", + "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", + "test_no_sync_correctness", + ), + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # # https://github.com/intel/torch-xpu-ops/issues/1525 + # # ValueError: trying to initialize the default process group twice! 
+ # "test_inductor_all_gather_into_tensor_coalesced", + # "test_inductor_all_gather_into_tensor_single", + # "test_inductor_all_reduce_coalesced", + # "test_inductor_all_reduce_non_contig_input", + # "test_inductor_all_reduce_single", + # "test_inductor_all_to_all_single", + # "test_inductor_broadcast", + # "test_inductor_inplace_op_on_view", + # "test_inductor_reduce_scatter_tensor_coalesced", + # "test_inductor_reduce_scatter_tensor_single", + # "test_inductor_reuse_buffer_after_inplace_collective", + # "test_ranks_and_tag", + # "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # # https://github.com/intel/torch-xpu-ops/issues/1535 + # "test_gather_object_cpu", + # "test_gather_object_xpu", + # "test_gather_object_list_cpu", + # "test_gather_object_list_xpu", + # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_object_list_cpu", + "test_scatter_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + # "test_asymmetric_compilation", + # "test_asymmetric_compilation_with_fx_cache", + # # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + # "test_compiled_flex_attention_full_model_ddp", + # "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + # "test_compiler_collectives_automatic_dynamic_scalar", + # "test_compiler_collectives_automatic_dynamic_speculation_divergence", + # "test_compiler_collectives_automatic_dynamic_tensor", + # "test_compiler_collectives_dim_mismatch", + # "test_compiler_collectives_graph_break_empty_graph_still_collective", + # "test_compiler_collectives_missing_source", + # "test_compiler_collectives_scalar_missing_source", + # "test_compiler_collectives_type_mismatch", + # "test_ddp_activation_checkpointing", + # "test_ddp_baseline_aot_eager_multiprocess", + # "test_fsdp_activation_checkpointing", + # "test_fsdp_aot_eager", + # "test_fsdp_inductor", + "test_fsdp_setattr", + # "test_fsdp_unspecialized_forced_getattr_inline", + # "test_fsdp_unspecialized_forced_getattr_no_inline", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": None, + "../../../../test/distributed/test_inductor_collectives.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1581 + # Fatal Python error: Segmentation fault + "test_dynamo_rewrite_dist_all_gather", + "test_dynamo_rewrite_dist_all_gather_list", + "test_dynamo_rewrite_dist_all_gather_args_match", + "test_dynamo_rewrite_dist_reduce_scatter", + "test_dynamo_support_collective_op_with_async_op_False", + "test_dynamo_trace_reduce_scatter_tensor", + "test_dynamo_trace_all_gather_tensor", + "test_dynamo_trace_allgather_coalesced", + "test_inductor_reduce_scatter_coalesced", + "test_inductor_all_gather_coalesced", + ), + 
"../../../../test/distributed/test_multi_threaded_pg.py": None, + # ( + # # oneccl not support multi-threaded well, so skip it first. + # # https://github.com/intel/torch-xpu-ops/issues/1509 + # "test_bwd_sees_fwd_pg", + # ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_stage_backward_weight_multiple_iters_xpu", + "test_stage_backward_weight_xpu", + "test_stage_backward_xpu", + ), + "../../../../test/distributed/pipelining/test_microbatch.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_chunk_spec_xpu", + ), + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' + # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. 
+        # https://github.com/intel/torch-xpu-ops/issues/1551
+        "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0",
+        "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1",
+        "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0",
+        "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1",
+        "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2",
+        "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0",
+        "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1",
+        "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2",
+    ),
+    "../../../../test/distributed/tensor/parallel/test_tp_examples.py": (
+        # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators!
+        # https://github.com/intel/torch-xpu-ops/issues/1555
+        "test_transformer_req_grad_seq_parallel_float32_thaw_all",
+        "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings",
+        "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings",
+        "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings",
+        "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings",
+        "test_transformer_training_is_seq_parallel_False_float32",
+        "test_transformer_training_is_seq_parallel_True_float32",
+        # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered.
+        # https://github.com/intel/torch-xpu-ops/issues/1556
+        "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output",
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_loss_parallel",
+        "test_mlp_training_is_seq_parallel_False_recompute_activation_False",
+        "test_mlp_training_is_seq_parallel_True_recompute_activation_False",
+        "test_transformer_req_grad_float64_thaw_all",
+        "test_transformer_training_is_seq_parallel_False_float64",
+        "test_transformer_training_is_seq_parallel_True_float64",
+        "test_sequence_parallel_style",
+    ),
+    "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None,
+    "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": (
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_linear_col_wise_parallel",
+        "test_parallelize_mlp_with_module_api",
+        "test_parallelize_mlp_with_module_api_nested",
+        "test_parallelize_module_multi_wildcard",
+        "test_parallelize_module_src_data_rank",
+        "test_parallelize_module_with_digit",
+        "test_parallelize_module_with_question",
+        "test_parallelize_module_with_star",
+        "test_under_devicemesh_context",
+    ),
+    "../../../../test/distributed/tensor/parallel/test_tp_style.py": None,
+    "../../../../test/distributed/tensor/test_api.py": (
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_distribute_tensor_rank",
+        "test_distribute_tensor_uneven_sharding",
+    ),
+    "../../../../test/distributed/tensor/test_attention.py": None,
+    "../../../../test/distributed/tensor/test_common_rules.py": None,
+    "../../../../test/distributed/tensor/test_dtensor.py": None,
+    "../../../../test/distributed/tensor/test_dtensor_compile.py": None,
+    # (
+    #     # https://jira.devtools.intel.com/browse/MLSL-3625
+    #     "test_2d_fsdp_tp_compile",
+    # ),
+    "../../../../test/distributed/tensor/test_experimental_ops.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1535
+        "test_bernoulli",
+    ),
+    "../../../../test/distributed/tensor/test_init.py": None,
+    "../../../../test/distributed/tensor/test_math_ops.py": (
+        # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path
+        # https://github.com/intel/torch-xpu-ops/issues/1508
+        "test_mean",
+        "test_nll_loss_and_cross_entropy",
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_cumsum",
+        "test_layer_norm_bwd",
+        "test_layer_norm_bwd_req_grad",
+        "test_layer_norm_fwd",
+        "test_linear_op_reductions",
+        "test_shard0_svd",
+        "test_softmax_fwd",
+        "test_topk",
+    ),
+    "../../../../test/distributed/tensor/test_random_ops.py": (
+        # Need to update world size
+        "test_hsdp_tp_model_meta_init",
+    ),
+    "../../../../test/distributed/tensor/test_redistribute.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1504
+        "test_redistribute_shard_dim_multi_dim_mesh",
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_redistribute_shard_dim_change",
+        "test_redistribute_uneven_sharding",
+        "test_shard_to_replicate_forward_backward",
+        "test_shard_to_replicate_forward_backward_datatype_conversion",
+        "test_multi_dim_mesh",
+    ),
+    "../../../../test/distributed/tensor/test_tensor_ops.py": (
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_aten_contiguous",
+        "test_gather",
+        "test_index",
+        # "test_op_out_variant",
+        "test_slice",
+        "test_stack",
+        # "test_where_type_promotion",
+    ),
+    "../../../../test/distributed/tensor/experimental/test_register_sharding.py": (
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_argmax",
+        "test_softmax_fwd",
+    ),
+    # FSDP2
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": (
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_nontensor_activations",
+        "test_unused_forward_module",
+        "test_unused_forward_output",
+    ),
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1504
+        "test_clip_grad_norm_2d",
+    ),
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1571
+        "test_set_reduce_scatter_divide_factor",
+    ),
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": (
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_train_mixed_requires_grad_per_group",
+    ),
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None,
+    # (
+    #     # https://github.com/intel/torch-xpu-ops/issues/1508
+    #     "test_gradient_scaler",
+    # ),
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": (
+        # https://github.com/intel/torch-xpu-ops/issues/1535
+        "test_fully_shard_training_memory",
+    ),
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": (
+        # https://jira.devtools.intel.com/browse/MLSL-3625
+        "test_compute_dtype",
+        "test_grad_acc_with_reduce_dtype",
+        "test_reduce_dtype",
+    ),
"../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1572 + "test_dp_state_dict_cpu_offload", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_post_optim_event", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_train_parity_multi_group_unshard_async_op", + "test_train_parity_with_activation_checkpointing", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_1f1b_microbatching", + "test_gradient_accumulation", + ), + "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( + # AssertionError: Tensor-likes are not close! + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_compile_backward_only", + "test_compile_bf16", + "test_compile_fp16", + "test_compile_gpu", + "test_compile_gpu_ac", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { + # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + "test_complete_world_size", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_grid_sharding", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_with_rpc_names", + "test_init_from_local_tensor", + # what(): Attempting to send a Tensor with unexpected device type xpu:3 + # https://github.com/intel/torch-xpu-ops/issues/1616 + "test_init_from_local_shards", + "test_init_from_local_shards_and_global_metadata", + }, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, + "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, + "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_tracker_multi_group_eager", + "test_tracker_non_root_forward_backward", + "test_tracker_with_activation_checkpointing", + ), + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, + "../../../../test/distributed/_tools/test_mod_tracker.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
+ "../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, +} diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 7b2bac5e6..d58d3d9a5 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1153,6 +1153,7 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + rename = test_case.replace("../../../../", "").replace("/", "_") if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -1160,8 +1161,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += skip_options elif exe_list is not None: @@ -1171,13 +1171,11 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) return os.system(test_command)