From a2c3f35f047e6b711d3ccf4983856482563b1c13 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 01/41] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From e772d23680c67301d6e9e5a47b741fc622c49158 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 02/41] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cbd34cd308e4cd601561c3ce64e44c408b94f730 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 03/41] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] 
os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From d856e950310ed44446d81d9b37250b7b7d4fbcc3 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 04/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 28a259e59448bb70958a818d3f50fee62f2ebfa2 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 05/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ 
-1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From 62e9ff75ced8a311c1e52c61fd49c97622075378 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 06/41] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ee9381c9c..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -70,7 +70,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index a11528a3e..aa631c6dd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -435,3 +435,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate 
xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ec2a73a20..9cf7ef458 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From 119d2fb5b20a32990eeb0377ce490f2fe3f89894 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 07/41] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 3ed1c3d4e..eda5de367 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,20 +72,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index aa631c6dd..907c5cd2a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -49,7 +49,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From 5ff20baae6dba5dee9d6c2ea83773a436229e299 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 08/41] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9cf7ef458..f0b1b8e22 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From cc472d7823415596734eb9c7e7afb0a3b8c7203b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 09/41] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From 60dbd6eb19a407058eb5f1e6c4972df7fed94fe1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 10/41] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index eda5de367..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,22 +72,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From af0bca95baf745631876e918dfd4ab6b6823778c Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 11/41] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + 
#"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From 6885a00cdf79029a72ff85938bdf330937ada7e4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 12/41] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] 
+os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cd013d7882b28620cf0b81aace3f212bcbedaca9 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 13/41] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From cd92f232de04270a17571df0989be7f32f679fcf Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 14/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 413c2b09b48eba42bfc67ed70fb03973edef50a5 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 15/41] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ 
-1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From ab68eeef12b5546c9d5ff7000b222442ce88ca3f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 16/41] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index b67be9f29..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -65,7 +65,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e2e21bbfb..1edd00a7c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -420,3 +420,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ 
&& rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3dd204e32..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From c5ec1405e405404d2f3f991d8ffbc213f6f2da5a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 17/41] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..e31d1e27b 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,20 +67,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1edd00a7c..94dacaf54 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From edc9e1b5bcde0adf04d47a634ab413cbae41c05a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 18/41] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..eec6b2893 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 6c9e99adf2288f6652d0ccc8b84749e353800b85 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 19/41] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From bdfa8536c16191cede8c9fd5710e1b90a8e526cc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 20/41] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e31d1e27b..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,22 +67,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From 0e77f3030f4e03c4b2cbadf19e1d3cf7c523d744 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 14:55:26 +0000 Subject: [PATCH 21/41] update if condition --- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 94dacaf54..deddcc5db 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index eec6b2893..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,7 +78,6 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 - keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 4076a1a940d148137f9f530c5efface6ba2365d4 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 18:12:34 +0000 Subject: [PATCH 22/41] resolve Artifact name conflict --- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 15 +++++---------- .github/workflows/pull.yml | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..ae6c2064c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -171,11 +171,11 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index deddcc5db..0e8265639 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -95,7 +95,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -339,7 +339,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -474,18 +474,13 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} @@ -534,7 +529,7 @@ jobs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - timeout 10000 python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: UT Test Results Check @@ -558,5 +553,5 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..0e9ee9f63 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -88,7 +88,7 @@ jobs: needs: preci-linux-build-distributed uses: ./.github/workflows/_linux_ut.yml with: - pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} ut: pytorch_distributed runner: pvc_e2e @@ -137,7 +137,7 @@ jobs: if: ${{ inputs.pytorch }} != 'nightly_wheel' uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | From 5596ac4436e9d6b1b0367915b3d52ea25c408b5b Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Mon, 7 Apr 2025 23:41:37 -0700 Subject: [PATCH 23/41] enabled test_sharder.py on xpu --- test/xpu/skip_list_dist_local.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..218746b71 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -137,6 +137,7 @@ "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/_shard/test_sharder.py": None, } skip_dict_python = { From 2ed797354aab68575dc8c4ee0f746c9eef9eadac Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 9 Apr 2025 00:18:27 -0700 Subject: [PATCH 24/41] Enabled UT for test/distributed/tensor Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..42cdebf19 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -136,7 +136,85 @@ "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' 
+ # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
+ # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, } skip_dict_python = { @@ -144,4 +222,5 @@ "distributed/test_c10d_xccl.py": None, "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, } From 5bab858cbde56b7319c43690157aee43d06917f3 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 9 Apr 2025 23:57:58 -0700 Subject: [PATCH 25/41] add FSDP2 cases, improved check-ut.py for summary, do ZE_AFFINITY_MASK configuration before import torch --- .github/scripts/check-ut.py | 5 ++++- test/xpu/run_distributed_local.py | 3 ++- test/xpu/skip_list_dist_local.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..9d9e4edfd 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -30,7 +30,8 @@ def get_result(case): def get_message(case): if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" + #return f" for line in {case.result[0].message.splitlines()}" + return [item for item in case.result[0].message.splitlines() if "Error:" in item] def print_md_row(row, print_header): if print_header: @@ -75,6 +76,8 @@ def print_suite(suite): category = 'op_extended' elif 'op_ut' in ut: category = 'op_ut' + else: + category = "default" row = { 'Category': category, 'UT': ut, diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..1c2435e15 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -3,7 +3,6 @@ import sys from skip_list_dist_local import skip_dict, skip_dict_python -from xpu_test_utils import launch_test res = 0 res2 = 0 @@ -50,6 +49,8 @@ sys.exit(255) +from xpu_test_utils import launch_test + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..a41c91f18 100644 --- a/test/xpu/skip_list_dist_local.py +++ 
b/test/xpu/skip_list_dist_local.py @@ -216,6 +216,23 @@ "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, } skip_dict_python = { From f1b824d7764ddf88989f1960519a84dc449fbb56 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 10 Apr 2025 01:27:23 -0700 Subject: [PATCH 26/41] Skip test_schedule_multiproc.py for hang error Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..b2984fb17 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -221,7 +221,7 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, } From 43a296c7f98f69793a941db5cd0cdcca66fc5578 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Tue, 1 Apr 2025 09:35:56 +0000 Subject: [PATCH 27/41] Update UT summary --- .github/workflows/_linux_ut.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index b409d5774..a7650f3f8 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -182,6 +182,18 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log cp *.xml ${{ github.workspace }}/ut_log + find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"op_ut_with_skip_quantization/core"*) + dir_name="op_ut_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log + cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. From 0f684ac1e2f5b4db44dc62d59be0b069ba934e49 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 00:34:12 -0700 Subject: [PATCH 28/41] Update ut summary for more details --- .github/scripts/check-ut.py | 51 ++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..cd0b7b405 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -30,7 +30,54 @@ def get_result(case): def get_message(case): if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" + full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message + if not full_text: + return "" + error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + ] + + error_messages = [] + current_error = None + capture_next_lines = False + indent_level = 0 + + for line in full_text.splitlines(): + stripped_line = line.strip() + if not stripped_line: + continue + + for error_type in error_types: + if stripped_line.startswith(error_type + ": "): + current_error = error_type + error_msg = stripped_line[len(error_type)+2:] + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: + current_error = error_type + error_msg = stripped_line.split(f'{error_type}:')[-1].strip() + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + + return "\n".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" def print_md_row(row, print_header): if print_header: @@ -75,6 +122,8 @@ def print_suite(suite): category = 'op_extended' elif 
'op_ut' in ut: category = 'op_ut' + else: + category = "unknown" row = { 'Category': category, 'UT': ut, From d1828de5acf482a23bbaab4ab936ab00e59665cd Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 00:40:27 -0700 Subject: [PATCH 29/41] align the lint check --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index cd0b7b405..0cf82d159 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -35,7 +35,7 @@ def get_message(case): return "" error_types = [ "RuntimeError", - "ValueError", + "ValueError", "TypeError", "AttributeError", "KeyError", From b8dc74bc9ca31053085e337137e9a7b9020d240e Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 00:43:37 -0700 Subject: [PATCH 30/41] remove unneccessary parm --- .github/scripts/check-ut.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 0cf82d159..d9c3d66a7 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -52,7 +52,6 @@ def get_message(case): ] error_messages = [] - current_error = None capture_next_lines = False indent_level = 0 @@ -63,14 +62,12 @@ def get_message(case): for error_type in error_types: if stripped_line.startswith(error_type + ": "): - current_error = error_type error_msg = stripped_line[len(error_type)+2:] error_messages.append(f"{error_type}: {error_msg}") capture_next_lines = True indent_level = 0 break elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: - current_error = error_type error_msg = stripped_line.split(f'{error_type}:')[-1].strip() error_messages.append(f"{error_type}: {error_msg}") capture_next_lines = True From f7a2fd3b70a6870af8c2f78387a6a8c4332b7468 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 18:21:58 -0700 Subject: [PATCH 31/41] change the delimiter --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index d9c3d66a7..91ff39ee7 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -74,7 +74,7 @@ def get_message(case): indent_level = 0 break - return "\n".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + return " | ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" def print_md_row(row, print_header): if print_header: From 3d60d1f9ec06584008021212f08e1ccaa5885a18 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 18:22:39 -0700 Subject: [PATCH 32/41] change the delimiter --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 91ff39ee7..8aa069ebe 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -74,7 +74,7 @@ def get_message(case): indent_level = 0 break - return " | ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" def print_md_row(row, print_header): if print_header: From b7797296834fc4171df0dbdd68a590f4289a30e0 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Thu, 10 Apr 2025 23:58:59 -0700 Subject: [PATCH 33/41] add NotImplementedError check --- .github/scripts/check-ut.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8aa069ebe..e67fa812d 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -49,6 +49,7 @@ def get_message(case): "asyncio.TimeoutError", "FileNotFoundError", "PermissionError", + "NotImplementedError", ] error_messages = [] From f696faad63d48e4a2e65a15340c998aedc9d529d Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Mon, 14 Apr 2025 23:14:30 -0700 Subject: [PATCH 34/41] refine error log for test files without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 52 ++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..96761cd82 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,7 @@ res = 0 res2 = 0 fail_test = [] +error_log = "" os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" @@ -59,20 +60,49 @@ def run(test_command): for key in skip_dict_python: - skip_list = skip_dict_python[key] + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) if fail.returncode: - for line in fail.stderr.split("\n"): - if "FAIL: " in line: - is_error = True - for skip_case in skip_list: - if skip_case in line: - print("Skiped error: ", key + " " + skip_case) - is_error = False - if is_error: - res2 += fail.returncode - fail_test.append("".join(key + " " + line)) + num_skipped = 0 + num_err = 0 + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + else: + error_log += (line + "\n") + else: + error_log += ("FAIL: " + err) + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + + if num_err > 0: + fail_test.append(key) + renamed_key = key.replace("../../../../", "").replace("/", "_") + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) # run pytest with skiplist for key in skip_dict: From 00326ac761623a735718105609c6e0cb05686a7c Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Tue, 15 Apr 2025 01:50:09 -0700 Subject: [PATCH 35/41] Fixed error for create log file without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 46a0be814..46905cef1 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -64,9 +64,9 @@ def run(test_command): skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = 
["python", key] fail = run(test_command) + num_skipped = 0 + num_err = 0 if fail.returncode: - num_skipped = 0 - num_err = 0 for i, err in enumerate(fail.stderr.split("FAIL: ")): if i == 0 and len(err) > 0: error_log += err @@ -99,11 +99,16 @@ def run(test_command): num_errs = line.split("=")[1].split(")")[0].strip() error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + renamed_key = key.replace("../../../../", "").replace("/", "_") if num_err > 0: fail_test.append(key) - renamed_key = key.replace("../../../../", "").replace("/", "_") with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) + else: + import pdb;pdb.set_trace() + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) # run pytest with skiplist for key in skip_dict: From 8ad304c6881dc9ad625eb89bc11f85e224124c5a Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Tue, 15 Apr 2025 02:34:04 -0700 Subject: [PATCH 36/41] add log summary function --- .github/scripts/check-ut.py | 287 ++++++++++++++++++++++++------------ 1 file changed, 192 insertions(+), 95 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index e67fa812d..fb636ec6a 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -1,22 +1,47 @@ import argparse import sys import os +import re from junitparser import JUnitXml, Error, Failure, Skipped -parser = argparse.ArgumentParser() -parser.add_argument('junitxml', nargs='+') +parser = argparse.ArgumentParser(description='Test results analyzer') +parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') args = parser.parse_args() failures = [] -suites = [] +summaries = [] + +error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + "NotImplementedError", +] def get_classname(case): - return ' '.join(case.classname.split()) + return ' '.join(case.classname.split()) if hasattr(case, 'classname') else case.get('classname', '') def get_name(case): + if isinstance(case, dict): + return case.get('name', '') return ' '.join(case.name.split()) def get_result(case): + if isinstance(case, dict): + return case.get('status', 'failed') + result = "passed" if case.result: if isinstance(case.result[0], Error): @@ -28,29 +53,14 @@ def get_result(case): return result def get_message(case): + if isinstance(case, dict): + return case.get('error', '') + if not case.result: return "" full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message if not full_text: return "" - error_types = [ - "RuntimeError", - "ValueError", - "TypeError", - "AttributeError", - "KeyError", - "IndexError", - "ImportError", - "AssertionError", - "Exception", - "OSError", - "Failed", - "TimeoutError", - "asyncio.TimeoutError", - "FileNotFoundError", - "PermissionError", - "NotImplementedError", - ] error_messages = [] capture_next_lines = False @@ -77,86 +87,173 @@ def get_message(case): return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" -def print_md_row(row, print_header): +def print_md_row(row, print_header=False): if print_header: - header = " | ".join([f"{key}" for key, _ in row.items()]) + header = " | ".join([f"{key}" for key in row.keys()]) 
print(f"| {header} |") - header = " | ".join(["-"*len(key) for key, _ in row.items()]) + header = " | ".join(["---"] * len(row)) print(f"| {header} |") - row = " | ".join([f"{value}" for _, value in row.items()]) - print(f"| {row} |") + row_values = " | ".join([f"{value}" for value in row.values()]) + print(f"| {row_values} |") + +def print_failures(): + if not failures: + return -def print_cases(cases): + print("### Test Failures") print_header = True - for case in cases: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - message = get_message(case) - row = { - 'Class name': classname, - 'Test name': name, - 'Status': result, - 'Message': message, - } - print_md_row(row, print_header) + for case in failures: + print_md_row({ + 'Class name': get_classname(case), + 'Test name': get_name(case), + 'Status': get_result(case), + 'Message': get_message(case), + 'Source': case['source'] if isinstance(case, dict) else 'XML' + }, print_header) print_header = False -def print_suite(suite): - print_header = True - for suite in suites: - ut = args.junitxml[0] - del(args.junitxml[0]) - ut = os.path.basename(ut).split('.')[0] - tests = suite.tests - skipped = suite.skipped - failures = suite.failures - errors = suite.errors - if ut == 'op_regression': - category = 'op_regression' - elif ut == 'op_regression_dev1': - category = 'op_regression_dev1' - elif ut == 'op_extended': - category = 'op_extended' - elif 'op_ut' in ut: - category = 'op_ut' +def parse_log_file(log_file): + with open(log_file, 'r', encoding='utf-8') as f: + content = f.read() + + ut_name = os.path.splitext(os.path.basename(log_file))[0] + summary = { + 'Category': determine_category(ut_name), + 'UT': ut_name, + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0, + 'Source': 'Log' + } + + # Extract test counts + test_run_match = re.search(r"Ran (\d+) tests in [\d.]+s", content) + if test_run_match: + summary['Test cases'] = int(test_run_match.group(1)) + + # Extract skipped case number + skipped_match = re.search(r"skipped[ =](\d+)", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + else: + skipped_match = re.search(r"skipped (\d+) cases?", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + + # Extract failures + failure_blocks = re.findall(r"(FAIL:.*?)(?:\n\n|\n=+\n|\Z)", content, re.DOTALL) + exist_test_names = set() + failures_number = 0 + + for block in failure_blocks: + case_match = re.match(r"FAIL: (\w+) \(__mp_main__\.(\w+)\)", block) + if not case_match: + continue + + test_name = case_match.group(1) + if test_name in exist_test_names: + continue + exist_test_names.add(test_name) + + error_msg = [] + error_pattern = r"(" + "|".join(error_types) + r"):.*?(?=\n\S|\n\n|\n=+\n|\Z)" + error_matches = re.finditer(error_pattern, block, re.DOTALL) + if not error_matches and "Traceback" in block: + error_msg.append("Unknown error (see traceback)") else: - category = "unknown" - row = { - 'Category': category, - 'UT': ut, - 'Test cases': tests, - 'Passed': tests-skipped-failures-errors, - 'Skipped': skipped, - 'Failures': failures, - 'Errors': errors, - } - print_md_row(row, print_header) + for match in error_matches: + error_msg.append(match.group(0).strip()) + + failures.append({ + 'classname': ut_name, + 'name': f"{case_match.group(2)}:{test_name}", + 'error': " ".join(error_msg), + 'status': 'failed', + 'source': 'Log' + }) + failures_number += 1 + + if failures_number > 
summary['Failures']: + summary['Failures'] = failures_number + summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + + return summary + +def determine_category(ut): + if ut == 'op_regression': + return 'op_regression' + elif ut == 'op_regression_dev1': + return 'op_regression_dev1' + elif ut == 'op_extended': + return 'op_extended' + elif 'op_ut' in ut: + return 'op_ut' + else: + return 'unknown' + +def process_log_file(log_file): + try: + summary = parse_log_file(log_file) + summaries.append(summary) + except Exception as e: + print(f"Error processing {log_file}: {e}", file=sys.stderr) + +def process_xml_file(xml_file): + try: + xml = JUnitXml.fromfile(xml_file) + ut = os.path.basename(xml_file).split('.')[0] + category = determine_category(ut) + + for suite in xml: + suite_summary = { + 'Category': category, + 'UT': ut, + 'Test cases': suite.tests, + 'Passed': suite.tests - suite.skipped - suite.failures - suite.errors, + 'Skipped': suite.skipped, + 'Failures': suite.failures, + 'Errors': suite.errors, + 'Source': 'XML' + } + summaries.append(suite_summary) + + for case in suite: + if get_result(case) not in ["passed", "skipped"]: + failures.append(case) + except Exception as e: + print(f"Error processing {xml_file}: {e}", file=sys.stderr) + +def print_summary(): + print("### Results Summary") + print_header = True + + for summary in summaries: + print_md_row({ + 'Category': summary['Category'], + 'UT': summary['UT'], + 'Test cases': summary['Test cases'], + 'Passed': summary['Passed'], + 'Skipped': summary['Skipped'], + 'Failures': summary['Failures'], + 'Errors': summary['Errors'], + 'Source': summary['Source'] + }, print_header) print_header = False -xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ] -for idx, xml in enumerate(xmls): - for suite in xml: - suites.append(suite) - for case in suite: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - if result not in ["passed", "skipped"]: - failures.append(case) - -printed = False -def print_break(needed): - if needed: - print("") - -if failures: - print_break(printed) - print("### Failures") - print_cases(failures) - printed = True - -print("### Results Summary") -print_suite(suites) - -sys.exit(0) +def main(): + for input_file in args.input_files: + if input_file.endswith('.log'): + process_log_file(input_file) + elif input_file.endswith('.xml'): + process_xml_file(input_file) + else: + print(f"Skipping unknown file type: {input_file}", file=sys.stderr) + + print_failures() + print_summary() + + +if __name__ == "__main__": + main() From 4627a25c1a36a338a1f0b20eed07eeae868d248d Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Tue, 15 Apr 2025 02:37:17 -0700 Subject: [PATCH 37/41] align the lint check --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index fb636ec6a..5758c4e6d 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -113,7 +113,7 @@ def print_failures(): print_header = False def parse_log_file(log_file): - with open(log_file, 'r', encoding='utf-8') as f: + with open(log_file, encoding='utf-8') as f: content = f.read() ut_name = os.path.splitext(os.path.basename(log_file))[0] From 59c609e66945c3b4d2dae80a3f909256451be4e3 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 15 Apr 2025 23:07:01 -0700 Subject: [PATCH 38/41] Skipped cases rasied issue Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py 
| 3 - test/xpu/skip_list_dist_local.py | 271 +++++++++++++++++++++++++++--- 2 files changed, 246 insertions(+), 28 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 46a0be814..63a588416 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -9,9 +9,6 @@ fail_test = [] error_log = "" -os.environ["CCL_ATL_TRANSPORT"] = "ofi" -os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index e6a2a34f3..9ec4c59e0 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -7,19 +7,120 @@ # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, - "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_ddp_parity_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_bf16_hook_has_wrapping_False_sharding_strategy0", + "test_bf16_hook_has_wrapping_False_sharding_strategy1", + "test_bf16_hook_has_wrapping_False_sharding_strategy2", + "test_bf16_hook_has_wrapping_True_sharding_strategy0", + "test_bf16_hook_has_wrapping_True_sharding_strategy1", + "test_bf16_hook_has_wrapping_True_sharding_strategy2", + "test_fp16_hook_has_wrapping_False_sharding_strategy1", + "test_fp16_hook_has_wrapping_False_sharding_strategy2", + "test_fp16_hook_has_wrapping_True_sharding_strategy0", + "test_fp16_hook_has_wrapping_True_sharding_strategy1", + "test_fp16_hook_has_wrapping_True_sharding_strategy2", + ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + 
"test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, - "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_hooks_multi_traversal_xpu", + "test_parity_with_ddp_xpu", + "test_parity_with_non_frozen_fsdp_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + 
"test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, @@ -28,28 +129,89 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 "test_fsdp_zero2_eval_with_prefetch", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_multi_forward_cpu", + ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, # https://github.com/intel/torch-xpu-ops/issues/1537 "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_flatten_sharded_optim_state_dict_nested", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + 
"test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_names", + "test_scatter_full_optim_state_dict_nested_halve_world_size", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_halve_world_size", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", "test_use_orig_params", ), # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, - "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, - "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", + ), + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_state_dict_save_load_flow_state_dict_type_local_state_dict", + 
"test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", + "test_state_dict_save_load_flow_state_dict_type_state_dict", + ), "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_diff_hyperparams_sharding_strategy_str_full_shard", + "test_diff_hyperparams_sharding_strategy_str_no_shard", + "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", + "test_no_sync_correctness", + ), "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, @@ -127,11 +289,20 @@ ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. + # https://github.com/intel/torch-xpu-ops/issues/1509 "test_bwd_sees_fwd_pg", ), "../../../../test/distributed/test_store.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_backward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_stage_backward_weight_multiple_iters_xpu", + "test_stage_backward_weight_xpu", + "test_stage_backward_xpu", + ), + "../../../../test/distributed/pipelining/test_microbatch.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_chunk_spec_xpu", + ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -184,7 +355,7 @@ "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! 
# https://github.com/intel/torch-xpu-ops/issues/1555 - "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", @@ -201,9 +372,29 @@ "../../../../test/distributed/tensor/test_api.py": None, "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, - "../../../../test/distributed/tensor/test_dtensor.py": None, - "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": ( + # Passed with updated test code for world_size 8 + "test_auto_implicit_replication", + "test_default_value_sub_mesh", + "test_device_mesh_nd", + "test_dtensor_2d_mesh", + "test_dtensor_api_device_mesh_context_manager", + "test_dtensor_device_mesh_device_conversion", + "test_dtensor_spec_local_shard_offset", + "test_from_local_sub_mesh", + "test_implicit_replication", + "test_metadata_consistency_check", + "test_redistribute_sub_mesh", + "test_split_tensor_1D", + ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_2d_fsdp_tp_compile", + ), + "../../../../test/distributed/tensor/test_experimental_ops.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_bernoulli", + ), "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path @@ -211,28 +402,58 @@ "test_mean", "test_nll_loss_and_cross_entropy", ), - "../../../../test/distributed/tensor/test_random_ops.py": None, - "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_random_ops.py": ( + # Need to update world size + "test_hsdp_tp_model_meta_init", + ), + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_redistribute_shard_dim_multi_dim_mesh", + ), "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, # FSDP2 "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_clip_grad_norm_2d", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1571 + "test_set_reduce_scatter_divide_factor", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, 
"../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_gradient_scaler", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fully_shard_training_memory", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1572 + "test_dp_state_dict_cpu_offload", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_post_optim_event", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_train_parity_multi_group_unshard_async_op", + "test_train_parity_with_activation_checkpointing", + ), } skip_dict_python = { From 1a30e75c19c6b977a00938d53909ae8cdfa4d164 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 16 Apr 2025 23:07:49 -0700 Subject: [PATCH 39/41] enable RAG based similar issue search --- .github/scripts/check-ut.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 7e7c6ecd4..eda195e3a 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -78,7 +78,7 @@ def get_message(case): error_messages.append(f"{error_type}: {error_msg}") capture_next_lines = True indent_level = 0 - break + break elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: error_msg = stripped_line.split(f'{error_type}:')[-1].strip() error_messages.append(f"{error_type}: {error_msg}") @@ -98,6 +98,37 @@ def print_md_row(row, print_header=False): row_values = " | ".join([f"{value}" for value in row.values()]) print(f"| {row_values} |") +def get_similar_issues(classname, name, result, message): + import requests + + os.environ["http_proxy"] = "" + os.environ["https_proxy"] = "" + DEFAULT_HOST_IP = "10.112.100.138" + + def QnA(request, host_ip=DEFAULT_HOST_IP): + import json + url = f"http://{host_ip}:8888/v1/chatqna" + + headers = {"Content-Type": "application/json"} + + response = requests.post(url, headers=headers, json=request) + return response + + prompt = f"unit test {name} {result} with {message}, is it a known issue? 
If yes, what is the issue id? And what is the owner and root cause?"
+
+    request = {
+        "messages": prompt,
+        "stream": False
+    }
+
+    response = QnA(request)
+    if response.status_code == 200:
+        result = response.json()["choices"][0]["message"]["content"]
+        answer = result.split("")[-1].strip()
+        answer = answer.split("**Answer:**")[-1].strip()
+        return answer
+    return ""
+
 def print_failures():
     if not failures:
         return
@@ -105,11 +136,13 @@ def print_failures():
     print("### Test Failures")
     print_header = True
     for case in failures:
+        issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case))
         print_md_row({
             'Class name': get_classname(case),
             'Test name': get_name(case),
             'Status': get_result(case),
             'Message': get_message(case),
+            'Similar issue': issue,
             'Source': case['source'] if isinstance(case, dict) else 'XML'
         }, print_header)
         print_header = False

From b4431a4edbebe27076eeef45af4c912248d4a7c5 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Fri, 25 Apr 2025 00:39:52 -0700
Subject: [PATCH 40/41] create ut_failure_list.csv to support offline triage,
 instead of accessing RAG directly

---
 .github/scripts/check-ut.py | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py
index eda195e3a..5778b5fd8 100644
--- a/.github/scripts/check-ut.py
+++ b/.github/scripts/check-ut.py
@@ -89,7 +89,7 @@ def get_message(case):
 
     return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}"
 
-def print_md_row(row, print_header=False):
+def print_md_row(row, print_header=False, fail_list=None):
     if print_header:
         header = " | ".join([f"{key}" for key in row.keys()])
         print(f"| {header} |")
@@ -98,6 +98,10 @@ def print_md_row(row, print_header=False):
     row_values = " | ".join([f"{value}" for value in row.values()])
     print(f"| {row_values} |")
 
+    if fail_list != None:
+        fail_list.write(f"| {row_values} |\n")
+
+
 def get_similar_issues(classname, name, result, message):
     import requests
 
@@ -133,19 +137,23 @@ def print_failures():
     if not failures:
         return
 
-    print("### Test Failures")
-    print_header = True
-    for case in failures:
-        issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case))
-        print_md_row({
-            'Class name': get_classname(case),
-            'Test name': get_name(case),
-            'Status': get_result(case),
-            'Message': get_message(case),
-            'Similar issue': issue,
-            'Source': case['source'] if isinstance(case, dict) else 'XML'
-        }, print_header)
-        print_header = False
+    with open("ut_failure_list.csv", "w") as fail_list:
+        fail_list.write("sep=\'|\'\n")
+
+        print("### Test Failures")
+        print_header = True
+        for case in failures:
+            #issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case))
+            print_md_row({
+                'Class name': get_classname(case),
+                'Test name': get_name(case),
+                'Status': get_result(case),
+                'Message': get_message(case),
+                #'Similar issue': issue,
+                'Source': case['source'] if isinstance(case, dict) else 'XML'
+            }, print_header, fail_list)
+
+            print_header = False
 
 def parse_log_file(log_file):
     with open(log_file, encoding='utf-8') as f:
@@ -264,6 +272,7 @@ def print_summary():
     print("### Results Summary")
     print_header = True
 
+
     for summary in summaries:
         print_md_row({
             'Category': summary['Category'],
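
With the change above, check-ut.py writes each failing case as a pipe-delimited row in ut_failure_list.csv: the first line is a sep hint for spreadsheet tools, and the data rows carry no header, in the same order as the print_md_row keys (Class name, Test name, Status, Message, Source). A minimal sketch of how the file could be parsed for offline triage, assuming the messages themselves contain no pipe characters:

    # Offline triage sketch for ut_failure_list.csv produced by check-ut.py.
    # The first line is the sep hint; each remaining row looks like
    # "| classname | testname | status | message | source |".
    with open("ut_failure_list.csv", encoding="utf-8") as f:
        next(f)  # skip the leading sep hint line
        for raw in f:
            fields = [c.strip() for c in raw.strip().strip("|").split("|")]
            if len(fields) >= 5:
                classname, testname, status, message, source = fields[:5]
                print(f"{testname}: {status} ({message})")
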
From c8f27e65fc0d33aa35129c811c87a1a24e9e5216 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Fri, 25 Apr 2025 00:54:15 -0700
Subject: [PATCH 41/41] upload ut_failure_list.csv

---
 .github/workflows/_linux_ut.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index 8a4cc0b45..afd8afdf0 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -266,6 +266,7 @@ jobs:
           source activate xpu_op_${ZE_AFFINITY_MASK}
           pip install junitparser
           python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true
+          if [ -f "ut_failure_list.csv" ]; then cp ut_failure_list.csv ${{ github.workspace }}/ut_log/.; fi
       - name: UT Test Results Check
         shell: bash
         run: |
@@ -704,3 +705,9 @@ jobs:
         with:
           name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed
           path: ${{ github.workspace }}/ut_log
+      - name: Upload XPU UT Log
+        if: ${{ ! cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed
+          path: ${{ github.workspace }}/ut_log/ut_failure_list.csv
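
With the failure list uploaded as an artifact, the RAG lookup that patch 40 takes out of the default reporting path can still be run by hand against downloaded rows. A rough sketch, assuming the ChatQnA service used by get_similar_issues in check-ut.py is reachable at the same host and port and returns an OpenAI-style choices payload; HOST_IP below is the DEFAULT_HOST_IP from that script and should be adjusted per environment:

    # Manual triage sketch mirroring the request format in get_similar_issues.
    import requests

    HOST_IP = "10.112.100.138"  # DEFAULT_HOST_IP in check-ut.py

    def ask_rag(test_name, status, message, host_ip=HOST_IP):
        prompt = (
            f"unit test {test_name} {status} with {message}, is it a known issue? "
            "If yes, what is the issue id? And what is the owner and root cause?"
        )
        resp = requests.post(
            f"http://{host_ip}:8888/v1/chatqna",
            headers={"Content-Type": "application/json"},
            json={"messages": prompt, "stream": False},
            timeout=60,
        )
        resp.raise_for_status()
        # check-ut.py reads the answer from choices[0].message.content
        return resp.json()["choices"][0]["message"]["content"]

    # Example (hypothetical row from ut_failure_list.csv):
    # print(ask_rag("test_gradient_scaler", "failed", "RuntimeError: ..."))
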