From a2c3f35f047e6b711d3ccf4983856482563b1c13 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 01/41] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From e772d23680c67301d6e9e5a47b741fc622c49158 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 02/41] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cbd34cd308e4cd601561c3ce64e44c408b94f730 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 03/41] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] 
os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From d856e950310ed44446d81d9b37250b7b7d4fbcc3 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 04/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 28a259e59448bb70958a818d3f50fee62f2ebfa2 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 05/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ 
-1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From 62e9ff75ced8a311c1e52c61fd49c97622075378 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 06/41] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ee9381c9c..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -70,7 +70,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index a11528a3e..aa631c6dd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -435,3 +435,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate 
xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ec2a73a20..9cf7ef458 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From 119d2fb5b20a32990eeb0377ce490f2fe3f89894 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 07/41] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 3ed1c3d4e..eda5de367 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,20 +72,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index aa631c6dd..907c5cd2a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -49,7 +49,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From 5ff20baae6dba5dee9d6c2ea83773a436229e299 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 08/41] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9cf7ef458..f0b1b8e22 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From cc472d7823415596734eb9c7e7afb0a3b8c7203b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 09/41] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From 60dbd6eb19a407058eb5f1e6c4972df7fed94fe1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 10/41] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index eda5de367..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,22 +72,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From af0bca95baf745631876e918dfd4ab6b6823778c Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 11/41] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + 
#"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From 6885a00cdf79029a72ff85938bdf330937ada7e4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 12/41] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] 
+os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cd013d7882b28620cf0b81aace3f212bcbedaca9 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 13/41] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From cd92f232de04270a17571df0989be7f32f679fcf Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 14/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 413c2b09b48eba42bfc67ed70fb03973edef50a5 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 15/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ 
-1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From ab68eeef12b5546c9d5ff7000b222442ce88ca3f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 16/41] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index b67be9f29..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -65,7 +65,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e2e21bbfb..1edd00a7c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -420,3 +420,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ 
&& rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3dd204e32..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From c5ec1405e405404d2f3f991d8ffbc213f6f2da5a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 17/41] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..e31d1e27b 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,20 +67,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1edd00a7c..94dacaf54 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From edc9e1b5bcde0adf04d47a634ab413cbae41c05a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 18/41] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..eec6b2893 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 6c9e99adf2288f6652d0ccc8b84749e353800b85 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 19/41] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From bdfa8536c16191cede8c9fd5710e1b90a8e526cc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 20/41] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e31d1e27b..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,22 +67,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From 0e77f3030f4e03c4b2cbadf19e1d3cf7c523d744 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 14:55:26 +0000 Subject: [PATCH 21/41] update if condition --- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 94dacaf54..deddcc5db 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index eec6b2893..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,7 +78,6 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 - keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 4076a1a940d148137f9f530c5efface6ba2365d4 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 18:12:34 +0000 Subject: [PATCH 22/41] resolve Artifact name conflict --- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 15 +++++---------- .github/workflows/pull.yml | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..ae6c2064c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -171,11 +171,11 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index deddcc5db..0e8265639 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -95,7 +95,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -339,7 +339,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -474,18 +474,13 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} @@ -534,7 +529,7 @@ jobs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - timeout 10000 python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: UT Test Results Check @@ -558,5 +553,5 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..0e9ee9f63 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -88,7 +88,7 @@ jobs: needs: preci-linux-build-distributed uses: ./.github/workflows/_linux_ut.yml with: - pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} ut: pytorch_distributed runner: pvc_e2e @@ -137,7 +137,7 @@ jobs: if: ${{ inputs.pytorch }} != 'nightly_wheel' uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | From 5596ac4436e9d6b1b0367915b3d52ea25c408b5b Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Mon, 7 Apr 2025 23:41:37 -0700 Subject: [PATCH 23/41] enabled test_sharder.py on xpu --- test/xpu/skip_list_dist_local.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..218746b71 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -137,6 +137,7 @@ "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/_shard/test_sharder.py": None, } skip_dict_python = { From 2ed797354aab68575dc8c4ee0f746c9eef9eadac Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 9 Apr 2025 00:18:27 -0700 Subject: [PATCH 24/41] Enabled UT for test/distributed/tensor Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..42cdebf19 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -136,7 +136,85 @@ "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' 
+ # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
+ # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, } skip_dict_python = { @@ -144,4 +222,5 @@ "distributed/test_c10d_xccl.py": None, "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, } From 5bab858cbde56b7319c43690157aee43d06917f3 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 9 Apr 2025 23:57:58 -0700 Subject: [PATCH 25/41] add FSDP2 cases, improved check-ut.py for summary, do ZE_AFFINITY_MASK configuration before import torch --- .github/scripts/check-ut.py | 5 ++++- test/xpu/run_distributed_local.py | 3 ++- test/xpu/skip_list_dist_local.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..9d9e4edfd 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -30,7 +30,8 @@ def get_result(case): def get_message(case): if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" + #return f" for line in {case.result[0].message.splitlines()}" + return [item for item in case.result[0].message.splitlines() if "Error:" in item] def print_md_row(row, print_header): if print_header: @@ -75,6 +76,8 @@ def print_suite(suite): category = 'op_extended' elif 'op_ut' in ut: category = 'op_ut' + else: + category = "default" row = { 'Category': category, 'UT': ut, diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..1c2435e15 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -3,7 +3,6 @@ import sys from skip_list_dist_local import skip_dict, skip_dict_python -from xpu_test_utils import launch_test res = 0 res2 = 0 @@ -50,6 +49,8 @@ sys.exit(255) +from xpu_test_utils import launch_test + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..a41c91f18 100644 --- a/test/xpu/skip_list_dist_local.py +++ 
b/test/xpu/skip_list_dist_local.py @@ -216,6 +216,23 @@ "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, } skip_dict_python = { From f1b824d7764ddf88989f1960519a84dc449fbb56 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 10 Apr 2025 01:27:23 -0700 Subject: [PATCH 26/41] Skip test_schedule_multiproc.py for hang error Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..b2984fb17 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -221,7 +221,7 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, } From 43a296c7f98f69793a941db5cd0cdcca66fc5578 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Tue, 1 Apr 2025 09:35:56 +0000 Subject: [PATCH 27/41] Update UT summary --- .github/workflows/_linux_ut.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index b409d5774..a7650f3f8 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -182,6 +182,18 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log cp *.xml ${{ github.workspace }}/ut_log + find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"op_ut_with_skip_quantization/core"*) + dir_name="op_ut_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log + cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. From 0f684ac1e2f5b4db44dc62d59be0b069ba934e49 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 00:34:12 -0700 Subject: [PATCH 28/41] Update ut summary for more details --- .github/scripts/check-ut.py | 51 ++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..cd0b7b405 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -30,7 +30,54 @@ def get_result(case): def get_message(case): if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" + full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message + if not full_text: + return "" + error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + ] + + error_messages = [] + current_error = None + capture_next_lines = False + indent_level = 0 + + for line in full_text.splitlines(): + stripped_line = line.strip() + if not stripped_line: + continue + + for error_type in error_types: + if stripped_line.startswith(error_type + ": "): + current_error = error_type + error_msg = stripped_line[len(error_type)+2:] + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: + current_error = error_type + error_msg = stripped_line.split(f'{error_type}:')[-1].strip() + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + + return "\n".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" def print_md_row(row, print_header): if print_header: @@ -75,6 +122,8 @@ def print_suite(suite): category = 'op_extended' elif 
'op_ut' in ut: category = 'op_ut' + else: + category = "unknown" row = { 'Category': category, 'UT': ut, From d1828de5acf482a23bbaab4ab936ab00e59665cd Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 00:40:27 -0700 Subject: [PATCH 29/41] align the lint check --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index cd0b7b405..0cf82d159 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -35,7 +35,7 @@ def get_message(case): return "" error_types = [ "RuntimeError", - "ValueError", + "ValueError", "TypeError", "AttributeError", "KeyError", From b8dc74bc9ca31053085e337137e9a7b9020d240e Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 00:43:37 -0700 Subject: [PATCH 30/41] remove unneccessary parm --- .github/scripts/check-ut.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 0cf82d159..d9c3d66a7 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -52,7 +52,6 @@ def get_message(case): ] error_messages = [] - current_error = None capture_next_lines = False indent_level = 0 @@ -63,14 +62,12 @@ def get_message(case): for error_type in error_types: if stripped_line.startswith(error_type + ": "): - current_error = error_type error_msg = stripped_line[len(error_type)+2:] error_messages.append(f"{error_type}: {error_msg}") capture_next_lines = True indent_level = 0 break elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: - current_error = error_type error_msg = stripped_line.split(f'{error_type}:')[-1].strip() error_messages.append(f"{error_type}: {error_msg}") capture_next_lines = True From f7a2fd3b70a6870af8c2f78387a6a8c4332b7468 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 18:21:58 -0700 Subject: [PATCH 31/41] change the delimiter --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index d9c3d66a7..91ff39ee7 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -74,7 +74,7 @@ def get_message(case): indent_level = 0 break - return "\n".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + return " | ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" def print_md_row(row, print_header): if print_header: From 3d60d1f9ec06584008021212f08e1ccaa5885a18 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 18:22:39 -0700 Subject: [PATCH 32/41] change the delimiter --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 91ff39ee7..8aa069ebe 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -74,7 +74,7 @@ def get_message(case): indent_level = 0 break - return " | ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" def print_md_row(row, print_header): if print_header: From b7797296834fc4171df0dbdd68a590f4289a30e0 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 23:58:59 -0700 Subject: [PATCH 33/41] add NotImplementedError check --- .github/scripts/check-ut.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8aa069ebe..e67fa812d 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -49,6 +49,7 @@ def get_message(case): "asyncio.TimeoutError", "FileNotFoundError", "PermissionError", + "NotImplementedError", ] error_messages = [] From f696faad63d48e4a2e65a15340c998aedc9d529d Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Mon, 14 Apr 2025 23:14:30 -0700 Subject: [PATCH 34/41] refine error log for test files without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 52 ++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..96761cd82 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,7 @@ res = 0 res2 = 0 fail_test = [] +error_log = "" os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" @@ -59,20 +60,49 @@ def run(test_command): for key in skip_dict_python: - skip_list = skip_dict_python[key] + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) if fail.returncode: - for line in fail.stderr.split("\n"): - if "FAIL: " in line: - is_error = True - for skip_case in skip_list: - if skip_case in line: - print("Skiped error: ", key + " " + skip_case) - is_error = False - if is_error: - res2 += fail.returncode - fail_test.append("".join(key + " " + line)) + num_skipped = 0 + num_err = 0 + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + else: + error_log += (line + "\n") + else: + error_log += ("FAIL: " + err) + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + + if num_err > 0: + fail_test.append(key) + renamed_key = key.replace("../../../../", "").replace("/", "_") + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) # run pytest with skiplist for key in skip_dict: From 00326ac761623a735718105609c6e0cb05686a7c Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Tue, 15 Apr 2025 01:50:09 -0700 Subject: [PATCH 35/41] Fixed error for create log file without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 46a0be814..46905cef1 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -64,9 +64,9 @@ def run(test_command): skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = 
["python", key] fail = run(test_command) + num_skipped = 0 + num_err = 0 if fail.returncode: - num_skipped = 0 - num_err = 0 for i, err in enumerate(fail.stderr.split("FAIL: ")): if i == 0 and len(err) > 0: error_log += err @@ -99,11 +99,16 @@ def run(test_command): num_errs = line.split("=")[1].split(")")[0].strip() error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + renamed_key = key.replace("../../../../", "").replace("/", "_") if num_err > 0: fail_test.append(key) - renamed_key = key.replace("../../../../", "").replace("/", "_") with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) + else: + import pdb;pdb.set_trace() + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) # run pytest with skiplist for key in skip_dict: From 8ad304c6881dc9ad625eb89bc11f85e224124c5a Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Tue, 15 Apr 2025 02:34:04 -0700 Subject: [PATCH 36/41] add log summary function --- .github/scripts/check-ut.py | 287 ++++++++++++++++++++++++------------ 1 file changed, 192 insertions(+), 95 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index e67fa812d..fb636ec6a 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -1,22 +1,47 @@ import argparse import sys import os +import re from junitparser import JUnitXml, Error, Failure, Skipped -parser = argparse.ArgumentParser() -parser.add_argument('junitxml', nargs='+') +parser = argparse.ArgumentParser(description='Test results analyzer') +parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') args = parser.parse_args() failures = [] -suites = [] +summaries = [] + +error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + "NotImplementedError", +] def get_classname(case): - return ' '.join(case.classname.split()) + return ' '.join(case.classname.split()) if hasattr(case, 'classname') else case.get('classname', '') def get_name(case): + if isinstance(case, dict): + return case.get('name', '') return ' '.join(case.name.split()) def get_result(case): + if isinstance(case, dict): + return case.get('status', 'failed') + result = "passed" if case.result: if isinstance(case.result[0], Error): @@ -28,29 +53,14 @@ def get_result(case): return result def get_message(case): + if isinstance(case, dict): + return case.get('error', '') + if not case.result: return "" full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message if not full_text: return "" - error_types = [ - "RuntimeError", - "ValueError", - "TypeError", - "AttributeError", - "KeyError", - "IndexError", - "ImportError", - "AssertionError", - "Exception", - "OSError", - "Failed", - "TimeoutError", - "asyncio.TimeoutError", - "FileNotFoundError", - "PermissionError", - "NotImplementedError", - ] error_messages = [] capture_next_lines = False @@ -77,86 +87,173 @@ def get_message(case): return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" -def print_md_row(row, print_header): +def print_md_row(row, print_header=False): if print_header: - header = " | ".join([f"{key}" for key, _ in row.items()]) + header = " | ".join([f"{key}" for key in row.keys()]) 
print(f"| {header} |") - header = " | ".join(["-"*len(key) for key, _ in row.items()]) + header = " | ".join(["---"] * len(row)) print(f"| {header} |") - row = " | ".join([f"{value}" for _, value in row.items()]) - print(f"| {row} |") + row_values = " | ".join([f"{value}" for value in row.values()]) + print(f"| {row_values} |") + +def print_failures(): + if not failures: + return -def print_cases(cases): + print("### Test Failures") print_header = True - for case in cases: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - message = get_message(case) - row = { - 'Class name': classname, - 'Test name': name, - 'Status': result, - 'Message': message, - } - print_md_row(row, print_header) + for case in failures: + print_md_row({ + 'Class name': get_classname(case), + 'Test name': get_name(case), + 'Status': get_result(case), + 'Message': get_message(case), + 'Source': case['source'] if isinstance(case, dict) else 'XML' + }, print_header) print_header = False -def print_suite(suite): - print_header = True - for suite in suites: - ut = args.junitxml[0] - del(args.junitxml[0]) - ut = os.path.basename(ut).split('.')[0] - tests = suite.tests - skipped = suite.skipped - failures = suite.failures - errors = suite.errors - if ut == 'op_regression': - category = 'op_regression' - elif ut == 'op_regression_dev1': - category = 'op_regression_dev1' - elif ut == 'op_extended': - category = 'op_extended' - elif 'op_ut' in ut: - category = 'op_ut' +def parse_log_file(log_file): + with open(log_file, 'r', encoding='utf-8') as f: + content = f.read() + + ut_name = os.path.splitext(os.path.basename(log_file))[0] + summary = { + 'Category': determine_category(ut_name), + 'UT': ut_name, + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0, + 'Source': 'Log' + } + + # Extract test counts + test_run_match = re.search(r"Ran (\d+) tests in [\d.]+s", content) + if test_run_match: + summary['Test cases'] = int(test_run_match.group(1)) + + # Extract skipped case number + skipped_match = re.search(r"skipped[ =](\d+)", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + else: + skipped_match = re.search(r"skipped (\d+) cases?", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + + # Extract failures + failure_blocks = re.findall(r"(FAIL:.*?)(?:\n\n|\n=+\n|\Z)", content, re.DOTALL) + exist_test_names = set() + failures_number = 0 + + for block in failure_blocks: + case_match = re.match(r"FAIL: (\w+) \(__mp_main__\.(\w+)\)", block) + if not case_match: + continue + + test_name = case_match.group(1) + if test_name in exist_test_names: + continue + exist_test_names.add(test_name) + + error_msg = [] + error_pattern = r"(" + "|".join(error_types) + r"):.*?(?=\n\S|\n\n|\n=+\n|\Z)" + error_matches = re.finditer(error_pattern, block, re.DOTALL) + if not error_matches and "Traceback" in block: + error_msg.append("Unknown error (see traceback)") else: - category = "unknown" - row = { - 'Category': category, - 'UT': ut, - 'Test cases': tests, - 'Passed': tests-skipped-failures-errors, - 'Skipped': skipped, - 'Failures': failures, - 'Errors': errors, - } - print_md_row(row, print_header) + for match in error_matches: + error_msg.append(match.group(0).strip()) + + failures.append({ + 'classname': ut_name, + 'name': f"{case_match.group(2)}:{test_name}", + 'error': " ".join(error_msg), + 'status': 'failed', + 'source': 'Log' + }) + failures_number += 1 + + if failures_number > 
summary['Failures']: + summary['Failures'] = failures_number + summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + + return summary + +def determine_category(ut): + if ut == 'op_regression': + return 'op_regression' + elif ut == 'op_regression_dev1': + return 'op_regression_dev1' + elif ut == 'op_extended': + return 'op_extended' + elif 'op_ut' in ut: + return 'op_ut' + else: + return 'unknown' + +def process_log_file(log_file): + try: + summary = parse_log_file(log_file) + summaries.append(summary) + except Exception as e: + print(f"Error processing {log_file}: {e}", file=sys.stderr) + +def process_xml_file(xml_file): + try: + xml = JUnitXml.fromfile(xml_file) + ut = os.path.basename(xml_file).split('.')[0] + category = determine_category(ut) + + for suite in xml: + suite_summary = { + 'Category': category, + 'UT': ut, + 'Test cases': suite.tests, + 'Passed': suite.tests - suite.skipped - suite.failures - suite.errors, + 'Skipped': suite.skipped, + 'Failures': suite.failures, + 'Errors': suite.errors, + 'Source': 'XML' + } + summaries.append(suite_summary) + + for case in suite: + if get_result(case) not in ["passed", "skipped"]: + failures.append(case) + except Exception as e: + print(f"Error processing {xml_file}: {e}", file=sys.stderr) + +def print_summary(): + print("### Results Summary") + print_header = True + + for summary in summaries: + print_md_row({ + 'Category': summary['Category'], + 'UT': summary['UT'], + 'Test cases': summary['Test cases'], + 'Passed': summary['Passed'], + 'Skipped': summary['Skipped'], + 'Failures': summary['Failures'], + 'Errors': summary['Errors'], + 'Source': summary['Source'] + }, print_header) print_header = False -xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ] -for idx, xml in enumerate(xmls): - for suite in xml: - suites.append(suite) - for case in suite: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - if result not in ["passed", "skipped"]: - failures.append(case) - -printed = False -def print_break(needed): - if needed: - print("") - -if failures: - print_break(printed) - print("### Failures") - print_cases(failures) - printed = True - -print("### Results Summary") -print_suite(suites) - -sys.exit(0) +def main(): + for input_file in args.input_files: + if input_file.endswith('.log'): + process_log_file(input_file) + elif input_file.endswith('.xml'): + process_xml_file(input_file) + else: + print(f"Skipping unknown file type: {input_file}", file=sys.stderr) + + print_failures() + print_summary() + + +if __name__ == "__main__": + main() From 4627a25c1a36a338a1f0b20eed07eeae868d248d Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Tue, 15 Apr 2025 02:37:17 -0700 Subject: [PATCH 37/41] align the lint check --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index fb636ec6a..5758c4e6d 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -113,7 +113,7 @@ def print_failures(): print_header = False def parse_log_file(log_file): - with open(log_file, 'r', encoding='utf-8') as f: + with open(log_file, encoding='utf-8') as f: content = f.read() ut_name = os.path.splitext(os.path.basename(log_file))[0] From 59c609e66945c3b4d2dae80a3f909256451be4e3 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 15 Apr 2025 23:07:01 -0700 Subject: [PATCH 38/41] Skipped cases rasied issue Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py 
| 3 - test/xpu/skip_list_dist_local.py | 271 +++++++++++++++++++++++++++--- 2 files changed, 246 insertions(+), 28 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 46a0be814..63a588416 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -9,9 +9,6 @@ fail_test = [] error_log = "" -os.environ["CCL_ATL_TRANSPORT"] = "ofi" -os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index e6a2a34f3..9ec4c59e0 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -7,19 +7,120 @@ # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, - "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_ddp_parity_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_bf16_hook_has_wrapping_False_sharding_strategy0", + "test_bf16_hook_has_wrapping_False_sharding_strategy1", + "test_bf16_hook_has_wrapping_False_sharding_strategy2", + "test_bf16_hook_has_wrapping_True_sharding_strategy0", + "test_bf16_hook_has_wrapping_True_sharding_strategy1", + "test_bf16_hook_has_wrapping_True_sharding_strategy2", + "test_fp16_hook_has_wrapping_False_sharding_strategy1", + "test_fp16_hook_has_wrapping_False_sharding_strategy2", + "test_fp16_hook_has_wrapping_True_sharding_strategy0", + "test_fp16_hook_has_wrapping_True_sharding_strategy1", + "test_fp16_hook_has_wrapping_True_sharding_strategy2", + ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + 
"test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, - "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_hooks_multi_traversal_xpu", + "test_parity_with_ddp_xpu", + "test_parity_with_non_frozen_fsdp_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + 
"test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, @@ -28,28 +129,89 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 "test_fsdp_zero2_eval_with_prefetch", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_multi_forward_cpu", + ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, # https://github.com/intel/torch-xpu-ops/issues/1537 "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_flatten_sharded_optim_state_dict_nested", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + 
"test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_names", + "test_scatter_full_optim_state_dict_nested_halve_world_size", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_halve_world_size", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", "test_use_orig_params", ), # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, - "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, - "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", + ), + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_state_dict_save_load_flow_state_dict_type_local_state_dict", + 
"test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", + "test_state_dict_save_load_flow_state_dict_type_state_dict", + ), "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_diff_hyperparams_sharding_strategy_str_full_shard", + "test_diff_hyperparams_sharding_strategy_str_no_shard", + "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", + "test_no_sync_correctness", + ), "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, @@ -127,11 +289,20 @@ ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. + # https://github.com/intel/torch-xpu-ops/issues/1509 "test_bwd_sees_fwd_pg", ), "../../../../test/distributed/test_store.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_backward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_stage_backward_weight_multiple_iters_xpu", + "test_stage_backward_weight_xpu", + "test_stage_backward_xpu", + ), + "../../../../test/distributed/pipelining/test_microbatch.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_chunk_spec_xpu", + ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -184,7 +355,7 @@ "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! 
# https://github.com/intel/torch-xpu-ops/issues/1555 - "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", @@ -201,9 +372,29 @@ "../../../../test/distributed/tensor/test_api.py": None, "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, - "../../../../test/distributed/tensor/test_dtensor.py": None, - "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": ( + # Passed with updated test code for world_size 8 + "test_auto_implicit_replication", + "test_default_value_sub_mesh", + "test_device_mesh_nd", + "test_dtensor_2d_mesh", + "test_dtensor_api_device_mesh_context_manager", + "test_dtensor_device_mesh_device_conversion", + "test_dtensor_spec_local_shard_offset", + "test_from_local_sub_mesh", + "test_implicit_replication", + "test_metadata_consistency_check", + "test_redistribute_sub_mesh", + "test_split_tensor_1D", + ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_2d_fsdp_tp_compile", + ), + "../../../../test/distributed/tensor/test_experimental_ops.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_bernoulli", + ), "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path @@ -211,28 +402,58 @@ "test_mean", "test_nll_loss_and_cross_entropy", ), - "../../../../test/distributed/tensor/test_random_ops.py": None, - "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_random_ops.py": ( + # Need to update world size + "test_hsdp_tp_model_meta_init", + ), + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_redistribute_shard_dim_multi_dim_mesh", + ), "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, # FSDP2 "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_clip_grad_norm_2d", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1571 + "test_set_reduce_scatter_divide_factor", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, 
"../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_gradient_scaler", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fully_shard_training_memory", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1572 + "test_dp_state_dict_cpu_offload", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_post_optim_event", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_train_parity_multi_group_unshard_async_op", + "test_train_parity_with_activation_checkpointing", + ), } skip_dict_python = { From 1a30e75c19c6b977a00938d53909ae8cdfa4d164 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 16 Apr 2025 23:07:49 -0700 Subject: [PATCH 39/41] enable RAG based similar issue search --- .github/scripts/check-ut.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 7e7c6ecd4..eda195e3a 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -78,7 +78,7 @@ def get_message(case): error_messages.append(f"{error_type}: {error_msg}") capture_next_lines = True indent_level = 0 - break + break elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: error_msg = stripped_line.split(f'{error_type}:')[-1].strip() error_messages.append(f"{error_type}: {error_msg}") @@ -98,6 +98,37 @@ def print_md_row(row, print_header=False): row_values = " | ".join([f"{value}" for value in row.values()]) print(f"| {row_values} |") +def get_similar_issues(classname, name, result, message): + import requests + + os.environ["http_proxy"] = "" + os.environ["https_proxy"] = "" + DEFAULT_HOST_IP = "10.112.100.138" + + def QnA(request, host_ip=DEFAULT_HOST_IP): + import json + url = f"http://{host_ip}:8888/v1/chatqna" + + headers = {"Content-Type": "application/json"} + + response = requests.post(url, headers=headers, json=request) + return response + + prompt = f"unit test {name} {result} with {message}, is it a known issue? 
If yes, what is the issue id? And what is the owner and root cause?"
+
+    request = {
+        "messages": prompt,
+        "stream": False
+    }
+
+    response = QnA(request)
+    if response.status_code == 200:
+        result = response.json()["choices"][0]["message"]["content"]
+        answer = result.split("")[-1].strip()
+        answer = answer.split("**Answer:**")[-1].strip()
+        return answer
+    return ""
+
 def print_failures():
     if not failures:
         return
@@ -105,11 +136,13 @@ def print_failures():
     print("### Test Failures")
     print_header = True
     for case in failures:
+        issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case))
         print_md_row({
             'Class name': get_classname(case),
             'Test name': get_name(case),
             'Status': get_result(case),
             'Message': get_message(case),
+            'Similar issue': issue,
             'Source': case['source'] if isinstance(case, dict) else 'XML'
         }, print_header)
         print_header = False

From b4431a4edbebe27076eeef45af4c912248d4a7c5 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Fri, 25 Apr 2025 00:39:52 -0700
Subject: [PATCH 40/41] create ut_failure_list.csv to support offline triage,
 instead of accessing RAG directly

---
 .github/scripts/check-ut.py | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py
index eda195e3a..5778b5fd8 100644
--- a/.github/scripts/check-ut.py
+++ b/.github/scripts/check-ut.py
@@ -89,7 +89,7 @@ def get_message(case):
 
     return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}"
 
-def print_md_row(row, print_header=False):
+def print_md_row(row, print_header=False, fail_list=None):
     if print_header:
         header = " | ".join([f"{key}" for key in row.keys()])
         print(f"| {header} |")
@@ -98,6 +98,10 @@ def print_md_row(row, print_header=False):
     row_values = " | ".join([f"{value}" for value in row.values()])
     print(f"| {row_values} |")
 
+    if fail_list != None:
+        fail_list.write(f"| {row_values} |\n")
+
+
 def get_similar_issues(classname, name, result, message):
     import requests
 
@@ -133,19 +137,23 @@ def print_failures():
     if not failures:
         return
 
-    print("### Test Failures")
-    print_header = True
-    for case in failures:
-        issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case))
-        print_md_row({
-            'Class name': get_classname(case),
-            'Test name': get_name(case),
-            'Status': get_result(case),
-            'Message': get_message(case),
-            'Similar issue': issue,
-            'Source': case['source'] if isinstance(case, dict) else 'XML'
-        }, print_header)
-        print_header = False
+    with open("ut_failure_list.csv", "w") as fail_list:
+        fail_list.write("sep=\'|\'\n")
+
+        print("### Test Failures")
+        print_header = True
+        for case in failures:
+            #issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case))
+            print_md_row({
+                'Class name': get_classname(case),
+                'Test name': get_name(case),
+                'Status': get_result(case),
+                'Message': get_message(case),
+                #'Similar issue': issue,
+                'Source': case['source'] if isinstance(case, dict) else 'XML'
+            }, print_header, fail_list)
+
+            print_header = False
 
 def parse_log_file(log_file):
     with open(log_file, encoding='utf-8') as f:
@@ -264,6 +272,7 @@ def print_summary():
     print("### Results Summary")
     print_header = True
 
+
     for summary in summaries:
         print_md_row({
             'Category': summary['Category'],
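
With the change above, check-ut.py writes each failing case as a pipe-delimited row in ut_failure_list.csv: the first line is a sep hint for spreadsheet tools, and the data rows carry no header, in the same order as the print_md_row keys (Class name, Test name, Status, Message, Source). A minimal sketch of how the file could be parsed for offline triage, assuming the messages themselves contain no pipe characters:

    # Offline triage sketch for ut_failure_list.csv produced by check-ut.py.
    # The first line is the sep hint; each remaining row looks like
    # "| classname | testname | status | message | source |".
    with open("ut_failure_list.csv", encoding="utf-8") as f:
        next(f)  # skip the leading sep hint line
        for raw in f:
            fields = [c.strip() for c in raw.strip().strip("|").split("|")]
            if len(fields) >= 5:
                classname, testname, status, message, source = fields[:5]
                print(f"{testname}: {status} ({message})")
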
From c8f27e65fc0d33aa35129c811c87a1a24e9e5216 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Fri, 25 Apr 2025 00:54:15 -0700
Subject: [PATCH 41/41] upload ut_failure_list.csv

---
 .github/workflows/_linux_ut.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index 8a4cc0b45..afd8afdf0 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -266,6 +266,7 @@ jobs:
           source activate xpu_op_${ZE_AFFINITY_MASK}
           pip install junitparser
           python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true
+          if [ -f "ut_failure_list.csv" ]; then cp ut_failure_list.csv ${{ github.workspace }}/ut_log/.; fi
       - name: UT Test Results Check
         shell: bash
         run: |
@@ -704,3 +705,9 @@ jobs:
         with:
           name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed
           path: ${{ github.workspace }}/ut_log
+      - name: Upload XPU UT Log
+        if: ${{ ! cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed
+          path: ${{ github.workspace }}/ut_log/ut_failure_list.csv
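
With the failure list uploaded as an artifact, the RAG lookup that patch 40 takes out of the default reporting path can still be run by hand against downloaded rows. A rough sketch, assuming the ChatQnA service used by get_similar_issues in check-ut.py is reachable at the same host and port and returns an OpenAI-style choices payload; HOST_IP below is the DEFAULT_HOST_IP from that script and should be adjusted per environment:

    # Manual triage sketch mirroring the request format in get_similar_issues.
    import requests

    HOST_IP = "10.112.100.138"  # DEFAULT_HOST_IP in check-ut.py

    def ask_rag(test_name, status, message, host_ip=HOST_IP):
        prompt = (
            f"unit test {test_name} {status} with {message}, is it a known issue? "
            "If yes, what is the issue id? And what is the owner and root cause?"
        )
        resp = requests.post(
            f"http://{host_ip}:8888/v1/chatqna",
            headers={"Content-Type": "application/json"},
            json={"messages": prompt, "stream": False},
            timeout=60,
        )
        resp.raise_for_status()
        # check-ut.py reads the answer from choices[0].message.content
        return resp.json()["choices"][0]["message"]["content"]

    # Example (hypothetical row from ut_failure_list.csv):
    # print(ask_rag("test_gradient_scaler", "failed", "RuntimeError: ..."))
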