Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lmdeploy/cli/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def add_parser_api_server():
hf_overrides = ArgumentHelper.hf_overrides(pt_group)
disable_metrics = ArgumentHelper.disable_metrics(pt_group)
dp = ArgumentHelper.dp(pt_group)
ArgumentHelper.ep(pt_group)
ep = ArgumentHelper.ep(pt_group)
ArgumentHelper.enable_microbatch(pt_group)
ArgumentHelper.enable_eplb(pt_group)
ArgumentHelper.role(pt_group)
Expand All @@ -148,6 +148,7 @@ def add_parser_api_server():
tb_group._group_actions.append(hf_overrides)
tb_group._group_actions.append(disable_metrics)
tb_group._group_actions.append(dp)
tb_group._group_actions.append(ep)
ArgumentHelper.cp(tb_group)
ArgumentHelper.rope_scaling_factor(tb_group)
ArgumentHelper.num_tokens_per_iter(tb_group)
Expand Down Expand Up @@ -255,6 +256,7 @@ def api_server(args):
tp=args.tp,
dp=args.dp,
cp=args.cp,
ep=args.ep,
nnodes=args.nnodes,
node_rank=args.node_rank,
dist_init_addr=args.dist_init_addr,
Expand Down
1 change: 1 addition & 0 deletions lmdeploy/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ class TurbomindEngineConfig:
tp: int = 1  # tensor-parallel size
dp: int = 1  # data-parallel size
cp: int = 1  # context-parallel size
ep: int = 1  # expert-parallel size (MoE); 1 disables expert parallelism
device_num: int = None  # total device count; derived from `devices`/nnodes when None
attn_tp_size: int = None  # attention tensor-parallel size; derived when None
attn_cp_size: int = None  # attention context-parallel size; derived when None
Expand Down
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class ModelConfig:
attn_tp_size: int = 1  # attention tensor-parallel size
attn_cp_size: int = 1  # attention context-parallel size
mlp_tp_size: int = 1  # MLP/FFN tensor-parallel size
ep_size: int = 1  # expert-parallel size; 1 means EP is disabled
model_format: str = 'hf'  # source checkpoint format
expert_num: list[int] = field(default_factory=list)  # NOTE(review): presumably per-layer expert counts; empty for dense models -- confirm
expert_router_bias: bool = False  # whether the MoE router has a bias term
Expand Down
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ def get_tm_model(model_path,
tm_cfg.model_config.attn_cp_size = engine_config.attn_cp_size
if engine_config.mlp_tp_size is not None:
tm_cfg.model_config.mlp_tp_size = engine_config.mlp_tp_size
tm_cfg.model_config.ep_size = engine_config.ep

output_model = OUTPUT_MODELS.get(output_model_name)(input_model=input_model,
cfg=tm_cfg,
Expand Down
2 changes: 1 addition & 1 deletion lmdeploy/turbomind/deploy/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ class Ffn(Module):

def __init__(self, model: BaseOutputModel):
self.model = model
self.tp = model.mlp_tp_size
self.tp = model.mlp_tp_size if model.model_config.ep_size == 1 else 1
# inter_sizes in config are padded and may be different from what's
# in the weights
self.inter_size = model.model_config.inter_size
Expand Down
22 changes: 21 additions & 1 deletion lmdeploy/turbomind/turbomind.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,27 @@ def complete_parallel_config(cfg: TurbomindEngineConfig):

def update_parallel_config(cfg: TurbomindEngineConfig):
cfg.device_num = len(cfg.devices) * cfg.nnodes if cfg.devices else cfg.device_num
if not complete_parallel_config(cfg):
if not complete_parallel_config(cfg) and cfg.ep > 1:
os.environ['NCCL_GIN_GDAKI_QP_DEPTH'] = '1024'
if cfg.communicator in ['cuda-ipc', 'native']:
assert cfg.nnodes == 1, 'TurboMind does not support multi-node with ep > 1'
total = cfg.dp * cfg.ep
if not cfg.device_num:
count = torch.cuda.device_count() * cfg.nnodes
if total < count:
count = total
cfg.device_num = count
assert total % cfg.device_num == 0
overlap = total // cfg.device_num
attn_dp_size = overlap
inner_tp_size = cfg.ep // overlap
cfg.outer_dp_size = cfg.dp // overlap
cfg.attn_dp_size = overlap
cfg.attn_tp_size = inner_tp_size // cfg.cp
cfg.attn_cp_size = cfg.cp
cfg.mlp_dp_size = 1
cfg.mlp_tp_size = cfg.attn_dp_size * cfg.attn_tp_size * cfg.attn_cp_size
elif not complete_parallel_config(cfg):
total = cfg.dp * cfg.tp
if not cfg.device_num:
count = torch.cuda.device_count() * cfg.nnodes
Expand Down
84 changes: 84 additions & 0 deletions src/turbomind/comm/device_comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,58 @@
#include <optional>
#include <stdexcept>
#include <vector>

#include <cuda_runtime.h>

#include "src/turbomind/comm/host_comm.h"
#include "src/turbomind/core/buffer.h"
#include "src/turbomind/core/tensor.h"

namespace turbomind::comm {

// Static configuration for expert-parallel (EP) communication.
// Passed to DeviceCommImpl::InitializeEp before Dispatch/Combine are used.
struct EpConfig {
    int num_nodes;    // number of participating nodes
    int num_experts;  // total number of experts across all ranks
    int hidden;       // hidden dimension of the token activations
    // NOTE(review): "ll" presumably means the low-latency (EpMode::kLowLatency)
    // path's per-rank token cap -- confirm against the backend implementation.
    int ll_max_tokens_per_rank;
};

// Operating mode for expert-parallel dispatch/combine.
enum class EpMode
{
    kNull,            // no mode selected
    kHighThroughput,  // throughput-optimized path
    kLowLatency,      // latency-optimized path (cf. EpConfig::ll_max_tokens_per_rank)
};

// Inputs to DeviceCommImpl::Dispatch. Every member is a reference to a
// caller-owned object, so the referents must stay alive for the whole call.
// `mode` is a non-const reference and can be mutated by the backend.
struct EpDispatchInput {
    EpMode& mode;                        // requested/selected communication mode
    core::Tensor& x;                     // input token activations
    core::Tensor_<float>& topk_weights;  // per-token top-k routing weights
    core::Tensor_<int64_t>& topk_idx;    // per-token top-k expert indices
};

// Results of DeviceCommImpl::Dispatch. Mixes owned values (out_x,
// out_topk_weights, handle) with reference members (f2n, f2E, en2f, offsets)
// that alias caller-owned buffers; those buffers must outlive this struct.
// NOTE(review): the index-map semantics below are inferred from the names --
// confirm against the concrete backend.
struct EpDispatchOutput {
    core::Tensor out_x;             // dispatched tokens held by this rank
    core::Tensor out_topk_weights;  // routing weights matching out_x
    core::Buffer_<int>& f2n;        // presumably flat-index -> node map
    core::Buffer_<int>& f2E;        // presumably flat-index -> expert map
    core::Buffer_<int>& en2f;       // presumably (expert, n) -> flat-index map
    core::Buffer_<int>& offsets;    // presumably per-expert/per-rank offsets

    // Opaque per-call state produced by Dispatch; must be passed back to the
    // matching Combine via EpCombineInput::handle.
    std::vector<core::Tensor> handle;

    int out_token_num;         // token count in out_x
    int out_expert_token_num;  // presumably count of (token, expert) pairs received
};

// Inputs to DeviceCommImpl::Combine. Reference members alias caller-owned
// objects and must outlive the call; `handle` is the per-call state returned
// by the matching Dispatch (EpDispatchOutput::handle).
struct EpCombineInput {
    EpMode& mode;                       // communication mode (see EpMode)
    core::Tensor& x;                    // expert outputs to combine
    std::vector<core::Tensor>& handle;  // routing state from the matching Dispatch
    std::optional<core::Tensor> topk_weights;  // optional routing weights
    std::optional<core::Tensor> topk_idx;      // optional top-k expert indices
};
Comment on lines +52 to +58
Copy link

Copilot AI Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EpCombineInput uses std::optional, but this header doesn't include <optional>, which will cause compilation errors depending on include order. Add #include <optional> (and keep headers self-contained).

Copilot uses AI. Check for mistakes.

// Result of DeviceCommImpl::Combine.
struct EpCombineOutput {
    core::Tensor out_x;  // combined output tokens
};

enum QueryAttr
{
kHasAllGather2D
Expand Down Expand Up @@ -117,6 +166,41 @@ class DeviceCommImpl {
{
throw std::runtime_error("not implemented");
}

// Variable-size reduce-scatter within communication group `group`.
// NOTE(review): `counts` presumably holds per-rank element counts (NCCL
// "V"-style collective) -- confirm against concrete backends.
// Base implementation is an unsupported stub and always throws.
virtual void ReduceScatterV(const void* sendbuff,  //
                            void* recvbuff,
                            const size_t* counts,
                            DataType type,
                            int group,
                            cudaStream_t stream)
{
    throw std::runtime_error("not implemented");
}

// Variable-size all-gather within communication group `group`.
// NOTE(review): `counts` presumably holds per-rank element counts, mirroring
// ReduceScatterV -- confirm against concrete backends.
// Base implementation is an unsupported stub and always throws.
virtual void AllGatherV(const void* sendbuff,  //
                        void* recvbuff,
                        const size_t* counts,
                        DataType type,
                        int group,
                        cudaStream_t stream)
{
    throw std::runtime_error("not implemented");
}

// One-time setup of expert-parallel communication resources described by
// `config`. The base class does not support EP and always throws.
virtual void InitializeEp(const EpConfig& config)
{
    throw std::runtime_error("ep not implemented");
}

// Expert-parallel dispatch step (presumably routes tokens to the ranks that
// own their selected experts -- confirm). See EpDispatchInput/EpDispatchOutput
// for argument lifetimes. Base implementation always throws.
virtual void Dispatch(const EpDispatchInput& input, EpDispatchOutput& output, int group)
{
    throw std::runtime_error("not implemented");
}

// Expert-parallel combine step, the inverse of Dispatch; consumes the handle
// produced by the matching Dispatch call (EpCombineInput::handle).
// Base implementation always throws.
virtual void Combine(const EpCombineInput& input, EpCombineOutput& output, int group)
{
    throw std::runtime_error("not implemented");
}
};

class DeviceComm {
Expand Down
37 changes: 35 additions & 2 deletions src/turbomind/comm/nccl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,38 @@ add_library(nccl_comm STATIC nccl.cu)
target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES} logger)
target_include_directories(nccl_comm PRIVATE ${NCCL_INCLUDE_DIRS})

set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nccl_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nccl_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

# --- DeepEP (expert-parallel) backend, gated on the NCCL version -------------
# Parse the NCCL version macros out of nccl.h. The header contents must be
# quoted when expanded: an unquoted ${NCCL_HEADER_CONTENTS} is split on
# semicolons (CMake's list separator), corrupting the regex input.
file(READ "${NCCL_INCLUDE_DIRS}/nccl.h" NCCL_HEADER_CONTENTS)
string(REGEX MATCH "#define NCCL_MAJOR[ \t]+([0-9]+)" _ "${NCCL_HEADER_CONTENTS}")
set(NCCL_MAJOR ${CMAKE_MATCH_1})
string(REGEX MATCH "#define NCCL_MINOR[ \t]+([0-9]+)" _ "${NCCL_HEADER_CONTENTS}")
set(NCCL_MINOR ${CMAKE_MATCH_1})
string(REGEX MATCH "#define NCCL_PATCH[ \t]+([0-9]+)" _ "${NCCL_HEADER_CONTENTS}")
set(NCCL_PATCH ${CMAKE_MATCH_1})
set(NCCL_VERSION_STRING "${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}")
message(STATUS "Detected NCCL version: ${NCCL_VERSION_STRING}")

# DeepEP requires NCCL >= 2.29.7 (NOTE(review): presumably for the GIN /
# device-initiated networking support used by deep_ep/gin_backend.cu --
# confirm). Older NCCL skips it so the rest of the build still succeeds.
if(NOT NCCL_VERSION_STRING VERSION_LESS "2.29.7")
    set(DEEP_EP_SOURCE_FILES
        deep_ep/deep_ep.cpp
        deep_ep/gin_backend.cu
        deep_ep/kernels/runtime.cu
        deep_ep/kernels/layout.cu
        deep_ep/kernels/intranode.cu
        deep_ep/kernels/internode.cu
        deep_ep/kernels/internode_ll.cu
        nccl_ep.cu)

    add_library(deepep STATIC ${DEEP_EP_SOURCE_FILES})
    target_link_libraries(deepep PRIVATE ${NCCL_LIBRARIES} CUDA::cudart)
    # DeepEP kernels are built for Hopper (SM90) only.
    set_property(TARGET deepep PROPERTY CUDA_ARCHITECTURES 90)
    target_include_directories(deepep PRIVATE ${NCCL_INCLUDE_DIRS})
    set_property(TARGET deepep PROPERTY POSITION_INDEPENDENT_CODE ON)
    set_property(TARGET deepep PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

    target_link_libraries(nccl_comm PRIVATE deepep)
else()
    message(STATUS "Skip deepep build because NCCL ${NCCL_VERSION_STRING} < 2.29.7")
endif()
Loading
Loading