Commit 63a03ee

[feature]2.2 custom_allreduce support cudagraph recapture (#4307)
* custom_allreduce support cudagraph recapture
* delete code
* add shut_down/restart default group
1 parent 9cc2c99 commit 63a03ee
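
Taken together, the seven diffs below let a worker drop its captured CUDA graphs, including the custom all-reduce buffers that were registered inside them, and capture again later without rebuilding the communicator. A minimal sketch of that cycle, using only names introduced in this commit; the backend object and the trigger for the next capture are placeholders, not code from the repository:

def recapture_after_weight_update(backend) -> None:
    # `backend` stands for the piecewise CUDA-graph backend defined in
    # cudagraph_piecewise_backend.py (an assumption about the caller, not
    # something this commit adds).
    backend.clear_graph()  # with this commit, also closes custom-allreduce IPC handles
    # Weights are then reloaded and process groups restarted (see
    # fastdeploy/rl/dynamic_weight_manager.py); the next warmup capture can
    # register fresh graph buffers because the stale handles are gone.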

File tree

7 files changed (+31, -3 lines)


custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 4 additions & 0 deletions

@@ -616,6 +616,8 @@ int64_t open_mem_handle(paddle::Tensor& mem_handle);

 void free_shared_buffer(int64_t buffer);

+void clear_ipc_handles(int64_t _fa);
+
 // speculative decoding Kernel
 std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
     const paddle::Tensor& input_ids,

@@ -1204,6 +1206,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

   m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");

+  m.def("clear_ipc_handles", &clear_ipc_handles, "clear_ipc_handles");
+
   m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");

   m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");

custom_ops/gpu_ops/custom_all_reduce/all_reduce.cu

Lines changed: 5 additions & 1 deletion

@@ -122,10 +122,14 @@ void register_graph_buffers(fptr_t _fa,
   for (int i = 0; i < handles.size(); i++) {
     bytes.emplace_back(handles[i].begin(), handles[i].end());
   }
-  bytes.reserve(handles.size());
   fa->register_graph_buffers(bytes, offsets);
 }

+void clear_ipc_handles(fptr_t _fa) {
+  auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
+  fa->clear_ipc_handles();
+}
+
 std::tuple<fptr_t, paddle::Tensor> allocate_shared_buffer_and_handle(
     int64_t size) {

custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh

Lines changed: 6 additions & 1 deletion

@@ -517,10 +517,15 @@ class CustomAllreduce {
 #undef KL
   }

-  ~CustomAllreduce() {
+  void clear_ipc_handles(){
     for (auto [_, ptr] : ipc_handles_) {
       CUDACHECK(cudaIpcCloseMemHandle(ptr));
     }
+    ipc_handles_.clear();
+  }
+
+  ~CustomAllreduce() {
+    clear_ipc_handles();
   }
 };
 } // namespace paddle

fastdeploy/distributed/communication.py

Lines changed: 6 additions & 0 deletions

@@ -42,6 +42,12 @@ def use_custom_allreduce(custom_all_reduce_max_bytes: int = 8192 * 1024):
     _TP_AR = CustomAllreduce(model_parallel_group, custom_all_reduce_max_bytes)


+def custom_ar_clear_ipc_handles():
+    global _TP_AR
+    if _TP_AR is not None:
+        _TP_AR.clear_ipc_handles()
+
+
 try:

     @paddle.jit.marker.unified
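
custom_ar_clear_ipc_handles() is intentionally a module-level, null-safe entry point: callers do not have to know whether custom all-reduce was ever enabled on this rank. A short usage sketch, mirroring the guard shown in the hunk:

from fastdeploy.distributed.communication import custom_ar_clear_ipc_handles

# Safe even if use_custom_allreduce() was never called on this process:
# the helper checks the module-global _TP_AR and returns when it is None.
custom_ar_clear_ipc_handles()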

fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py

Lines changed: 4 additions & 0 deletions

@@ -25,6 +25,7 @@
 from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
 from fastdeploy.model_executor.ops.gpu import (
     all_reduce,
+    clear_ipc_handles,
     dispose,
     get_graph_buffer_ipc_meta,
     init_custom_all_reduce,

@@ -220,6 +221,9 @@ def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
         else:
             return self.all_reduce(input, input, registered=False)

+    def clear_ipc_handles(self):
+        clear_ipc_handles(self._ptr)
+
     def close(self):
         if self._ptr:
             dispose(self._ptr)
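
The new method is what separates recapture from shutdown: clear_ipc_handles() only closes the peer cudaIpc mappings held by the underlying C++ object, while close() disposes of the object itself. A hedged sketch of the difference, assuming `ar` is an already-initialized wrapper instance:

# `ar` is a fastdeploy.distributed.custom_all_reduce.CustomAllreduce that has
# already been used for at least one CUDA-graph capture (assumed setup).
ar.clear_ipc_handles()  # close stale cudaIpc handles; ar._ptr stays valid, so a
                        # later capture can register graph buffers again
ar.close()              # by contrast, dispose(ar._ptr) tears the communicator down for good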

fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py

Lines changed: 5 additions & 1 deletion

@@ -23,7 +23,10 @@
 from paddle.device.cuda import graphs

 from fastdeploy.config import FDConfig
-from fastdeploy.distributed.communication import capture_custom_allreduce
+from fastdeploy.distributed.communication import (
+    capture_custom_allreduce,
+    custom_ar_clear_ipc_handles,
+)
 from fastdeploy.utils import get_logger

 logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")

@@ -208,6 +211,7 @@ def _create_entry_dict(self):
     def clear_graph(self):
         """ """
         # Clear graphs
+        custom_ar_clear_ipc_handles()
         for id, entry in self.concrete_size_entries.items():
             if entry.cuda_graph:
                 del entry.cuda_graph
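
In clear_graph(), the IPC cleanup now runs before the captured graphs are released: the handles cached in ipc_handles_ were opened when the peer buffers (including the graph buffers) were registered for the previous capture, and dropping them here leaves the communicator ready for the next one. An annotated paraphrase of the method after this commit (not a verbatim copy; whatever follows the loop in the original method is elided):

def clear_graph(self):
    """ """
    # Clear graphs
    custom_ar_clear_ipc_handles()  # close custom-allreduce IPC handles from the previous capture
    for id, entry in self.concrete_size_entries.items():
        if entry.cuda_graph:
            del entry.cuda_graph   # release each captured CUDA graph
    ...  # remaining statements of the original method are not shown in this hunk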

fastdeploy/rl/dynamic_weight_manager.py

Lines changed: 1 addition & 0 deletions

@@ -66,6 +66,7 @@ def update_parameters(self, pid: int = 0) -> None:
         paddle.device.cuda.empty_cache()

         if not self.first_load:
+            paddle.distributed.restart_process_group()
             paddle.distributed.restart_process_group(self.parallel_config.tp_group)
             if self.parallel_config.enable_expert_parallel:
                 paddle.distributed.restart_process_group(self.parallel_config.ep_group)
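
restart_process_group() with no argument restarts Paddle's default communication group, in addition to the TP/EP groups that were already being restarted; the commit message mentions a matching shut_down of the default group, which is presumably performed on the offload side of the update cycle and is not part of this diff. A hedged sketch of the restore path, where the shutdown counterpart is an assumption taken from the commit message rather than a call shown here:

import paddle.distributed as dist

def restart_groups(parallel_config) -> None:
    # Mirrors the `not self.first_load` branch of update_parameters(); the
    # corresponding shutdown calls on the default/TP/EP groups are assumed to
    # happen earlier, when weights are cleared, and are not shown in this commit.
    dist.restart_process_group()                              # default group (new in this commit)
    dist.restart_process_group(parallel_config.tp_group)      # tensor-parallel group
    if parallel_config.enable_expert_parallel:
        dist.restart_process_group(parallel_config.ep_group)  # expert-parallel group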
