
Commit 3085bab

calculate dispatch normal input parameters using npu instead of cpu (#177)
* move cpu calculation to notify_dispatch & intranode_dispatch adaptation
* calculate send_token_idx {bs * topk} in layout
* codecheck fix & remove redundant code
* sync layout modifications to ops2
* remove kernel printf
* change recv_num_tokens_per_expert_list dtype to int64_t
* change magicVal from int32_t to uint64_t
* change recv_num_tokens_per_expert_list to List[int]
1 parent 7fe4eeb commit 3085bab

26 files changed: +449 -197 lines
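For context, the quantity this commit moves onto the NPU is the per-(token, top-k slot) send index: for each slot routed to expert e, the number of earlier slots already routed to e. A minimal host-side sketch of that semantics, mirroring the CPU loop the commit deletes from Buffer::intranode_dispatch (see the deep_ep.cpp diff below; the free function and its name here are illustrative, not part of the repo):

    #include <cstdint>
    #include <vector>

    // Reference for sendTokenIdxSmall with shape {num_tokens, num_topk}:
    // out[i][j] is the running count of slots already routed to expert
    // topk_idx[i][j]; negative expert ids mean the slot is unused.
    std::vector<int32_t> send_token_idx_reference(const std::vector<int64_t> &topk_idx,
                                                  int num_tokens, int num_topk, int num_experts)
    {
        std::vector<int32_t> out(static_cast<size_t>(num_tokens) * num_topk, 0);
        std::vector<int32_t> expert_acc(num_experts, 0);  // per-expert running counter
        for (int i = 0; i < num_tokens; ++i) {
            for (int j = 0; j < num_topk; ++j) {
                const int64_t expert_idx = topk_idx[static_cast<size_t>(i) * num_topk + j];
                if (expert_idx >= 0) {
                    out[static_cast<size_t>(i) * num_topk + j] = expert_acc[expert_idx]++;
                }
            }
        }
        return out;
    }

After this change, aclnnDispatchLayout writes these values directly into the new send_token_idx_small output tensor, so intranode_dispatch no longer copies topk_idx to the CPU.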

csrc/deepep/deep_ep.cpp

Lines changed: 40 additions & 83 deletions
@@ -116,11 +116,13 @@ Buffer::get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std:
     7. The server offset of tokens received by each expert from this NPU.
        size:[numExpert, MAX_BS]
     */
+    auto send_token_idx_small = at::zeros({num_tokens, num_topk}, at::dtype(at::kInt).device(device));
     auto notify_send_data = at::zeros({notify_send_data_size}, at::dtype(at::kInt).device(device));
     EXEC_NPU_CMD(aclnnDispatchLayout, new_topk_idx, num_tokens, num_ranks, num_experts, num_topk, local_ranksize,
-                 num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank, notify_send_data);
+                 num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank, notify_send_data, send_token_idx_small);

     this->notify_send_data = notify_send_data;
+    this->send_token_idx_small = send_token_idx_small;
     this->notify_send_data_size = notify_send_data_size;

     std::optional<torch::Tensor> num_tokens_per_rdma_rank = std::nullopt;
@@ -161,6 +163,19 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
     EP_HOST_ASSERT(config.num_sms % 2 == 0);
     int num_channels = config.num_sms / 2;

+    at::Tensor expert_ids = new_topk_idx.to(at::kInt);
+    int64_t tp_size = 1;
+    int64_t tp_rank = 0;
+    int64_t quant_mode = use_quant ? DYNAMIC_SCALES : NO_SCALES;
+    auto recv_topk_idx = std::optional<at::Tensor>();
+    auto recv_topk_weights = std::optional<at::Tensor>();
+    // Wait streams
+    std::optional<EventHandle> event;
+    auto rank_prefix_matrix = at::empty({num_ranks, num_ranks}, at::dtype(at::kInt).device(x.device()));
+    auto channel_prefix_matrix = at::empty({num_ranks, num_channels}, at::dtype(at::kInt).device(x.device()));
+    auto recv_channel_prefix_matrix = at::empty({num_ranks, num_channels}, at::dtype(at::kInt).device(x.device()));
+    std::vector<int> num_recv_tokens_per_expert_list;
+
     at::Tensor new_x = x;
     // for padding
     if (topk_idx->size(0) < PADDING_SIZE) {
@@ -240,7 +255,11 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>

     auto send_data_offset = torch::empty({num_experts}, at::dtype(at::kInt).device(x.device()));
     at::Tensor recv_data = torch::empty({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
-
+    at::Tensor total_recv_token_ = torch::empty({1}, at::dtype(at::kInt).device(x.device()));
+    at::Tensor recv_count_ = torch::empty({num_experts}, at::dtype(at::kInt).device(x.device()));
+    at::Tensor recv_offset_ = torch::empty({num_experts}, at::dtype(at::kInt).device(x.device()));
+    at::Tensor max_bs_ = torch::empty({1}, at::dtype(at::kInt).device(x.device()));
+    at::Tensor recv_tokens_per_expert_ = torch::empty({num_local_experts}, at::dtype(at::kLong).device(x.device()));
     // get ep name
     char hcom_ep_name[HCOMM_NAME_LEN];
     if (!moe_all_to_all_group_name.empty()) {
@@ -257,95 +276,33 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
                  hcom_ep_name,  // commGroup
                  num_ranks,     // rankSize
                  rank,          // rankId
-                 local_rank_size, local_rank_id, send_data_offset, recv_data);
-
-    auto options_cpu = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
-    std::vector<int32_t> local_expert_acc(num_experts, 0);
-    auto send_token_idx_cpu = torch::empty({num_tokens, num_topk}, options_cpu);
-    auto send_token_idx_ptr = send_token_idx_cpu.data_ptr<int>();
-
-    auto topk_idx_cpu = new_topk_idx.to(at::kCPU);
-    auto topk_idx_ptr = topk_idx_cpu.data_ptr<int64_t>();
-    for (int i = 0; i < num_tokens; ++i) {
-        for (int j = 0; j < num_topk; ++j) {
-            int64_t expert_idx = topk_idx_ptr[i * num_topk + j];
-            if (expert_idx >= 0) {
-                int32_t cnt = local_expert_acc[expert_idx];
-                send_token_idx_ptr[i * num_topk + j] = cnt;
-                local_expert_acc[expert_idx]++;
-            }
-        }
-    }
-
-    EP_HOST_ASSERT(recv_data.dim() == 1 and recv_data.is_contiguous());
-    EP_HOST_ASSERT(recv_data.size(0) % num_experts == 0);
-    at::Tensor recv_offset_cpu = torch::empty({num_experts}, options_cpu);
-    at::Tensor recv_count_cpu = torch::empty({num_experts}, options_cpu);
-    auto recv_data_cpu = recv_data.to(at::kCPU);
-    auto recv_data_ptr = recv_data_cpu.data_ptr<int>();
-    auto recv_count_ptr = recv_count_cpu.data_ptr<int>();
-    auto recv_offset_ptr = recv_offset_cpu.data_ptr<int>();
-    int total_recv_tokens = 0;
-    int num_max_dispatch_tokens_per_rank = 0;
-    std::vector<int> num_recv_tokens_per_expert_list;
-
-    for (int64_t local_e = 0; local_e < num_local_experts; ++local_e) {
-        int64_t local_expert_recv_tokens = 0;
-        for (int64_t src_rank = 0; src_rank < num_ranks; ++src_rank) {
-            int64_t index = local_e * num_ranks + src_rank;
-            int64_t pair_idx = send_per_group * (src_rank * num_local_experts + local_e);
-
-            int recv_cnt = recv_data_ptr[pair_idx];             // count from this src_rank for this global_expert
-            int recv_off = recv_data_ptr[pair_idx + 1];         // offset in that src_rank's window
-            int send_num_tokens = recv_data_ptr[pair_idx + 2];  // all bs from rank
-
-            total_recv_tokens += recv_cnt;
-            recv_count_ptr[index] = total_recv_tokens;
-            recv_offset_ptr[index] = recv_off;
-            num_max_dispatch_tokens_per_rank = std::max(num_max_dispatch_tokens_per_rank, send_num_tokens);
-
-            local_expert_recv_tokens += recv_cnt;
-        }
-        num_recv_tokens_per_expert_list.push_back(local_expert_recv_tokens);
-    }
-
-    at::Tensor expert_ids = new_topk_idx.to(at::kInt);
-    int64_t tp_size = 1;
-    int64_t tp_rank = 0;
-    int64_t quant_mode = use_quant ? DYNAMIC_SCALES : NO_SCALES;
-    int64_t global_bs = static_cast<int64_t>(
-        std::max(num_max_dispatch_tokens_per_rank * num_ranks, static_cast<int64_t>(num_worst_tokens)));
-
-    auto send_token_idx = send_token_idx_cpu.to(x.device());
-    auto recv_offset = recv_offset_cpu.to(x.device());
-    auto recv_count = recv_count_cpu.to(x.device());
-
-    int num_recv_tokens = (total_recv_tokens == 0) ? 1 : total_recv_tokens;
+                 local_rank_size, local_rank_id, send_data_offset, recv_data, total_recv_token_, recv_count_,
+                 recv_offset_, max_bs_, recv_tokens_per_expert_);
+    auto send_token_idx_small = this->send_token_idx_small;
+    int64_t gBs = max_bs_.item<int>() * num_ranks;
+    int64_t trt = total_recv_token_.item<int>();
+    int num_recv_tokens = (trt == 0) ? 1 : trt;
     auto expandx_out = use_quant ? torch::empty({num_recv_tokens, hidden}, at::dtype(at::kChar).device(x.device()))
                                  : torch::empty({num_recv_tokens, hidden}, x.options());
     auto dynamic_scales_out = torch::empty({num_recv_tokens}, at::dtype(at::kFloat).device(x.device()));
     auto expand_idx_out = torch::empty({num_recv_tokens * 3}, at::dtype(at::kInt).device(x.device()));
+    if (topk_idx.has_value()) {
+        recv_topk_idx = at::empty({trt, num_topk}, topk_idx->options());
+        recv_topk_weights = at::empty({trt, num_topk}, topk_weights->options());
+    }

-    EXEC_NPU_CMD(aclnnCamMoeDispatchNormal, new_x, expert_ids, send_data_offset, send_token_idx, recv_offset,
-                 recv_count, hcom_ep_name,
+    EXEC_NPU_CMD(aclnnCamMoeDispatchNormal, new_x, expert_ids, send_data_offset, send_token_idx_small, recv_offset_,
+                 recv_count_, hcom_ep_name,
                  num_ranks,  // rankSize
                  rank,       // rankId
-                 hcom_ep_name, tp_size, tp_rank, num_experts, quant_mode, global_bs, expandx_out, dynamic_scales_out,
+                 hcom_ep_name, tp_size, tp_rank, num_experts, quant_mode, gBs, expandx_out, dynamic_scales_out,
                  expand_idx_out, dispatch_wait_recv_cost_stats_out);
-
-    auto recv_topk_idx = std::optional<at::Tensor>();
-    auto recv_topk_weights = std::optional<at::Tensor>();
-    if (topk_idx.has_value()) {
-        recv_topk_idx = at::empty({total_recv_tokens, num_topk}, topk_idx->options());
-        recv_topk_weights = at::empty({total_recv_tokens, num_topk}, topk_weights->options());
+    auto recv_token_per_exp_cpu = recv_tokens_per_expert_.to(at::kCPU);
+    auto recv_token_per_exp_ptr = recv_token_per_exp_cpu.data_ptr<int64_t>();
+    for (int64_t local_e = 0; local_e < num_local_experts; ++local_e) {
+        int token_cnt = static_cast<int>(recv_token_per_exp_ptr[local_e]);
+        num_recv_tokens_per_expert_list.emplace_back(token_cnt);
     }
-    // Wait streams
-    std::optional<EventHandle> event;
-
-    auto rank_prefix_matrix = at::empty({num_ranks, num_ranks}, at::dtype(at::kInt).device(x.device()));
-    auto channel_prefix_matrix = at::empty({num_ranks, num_channels}, at::dtype(at::kInt).device(x.device()));
-    auto recv_channel_prefix_matrix = at::empty({num_ranks, num_channels}, at::dtype(at::kInt).device(x.device()));
-
     // Return values
     return {expandx_out,
             dynamic_scales_out,
@@ -356,7 +313,7 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
             channel_prefix_matrix,
             recv_channel_prefix_matrix,
             expand_idx_out,
-            recv_count,
+            recv_count_,
             event};
 }
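The five new NotifyDispatch outputs replace the host-side reduction over recv_data deleted above. A sketch of the correspondence, reconstructed from the removed loop (the function and parameter names are illustrative; the removed code reads three ints per (src_rank, local_expert) pair: a count, an offset, and the sender's batch size):

    #include <algorithm>
    #include <cstdint>

    // What the kernel now produces on device: total_recv_token_{1},
    // recv_count_{num_experts} (cumulative), recv_offset_{num_experts},
    // max_bs_{1}, and recv_tokens_per_expert_{num_local_experts}.
    void notify_reduce_reference(const int32_t *recv_data, int64_t num_local_experts,
                                 int64_t num_ranks, int64_t send_per_group,
                                 int32_t *recv_count, int32_t *recv_offset,
                                 int64_t *recv_tokens_per_expert,
                                 int32_t *total_recv_tokens, int32_t *max_bs)
    {
        int32_t total = 0, max_send = 0;
        for (int64_t local_e = 0; local_e < num_local_experts; ++local_e) {
            int64_t expert_total = 0;
            for (int64_t src_rank = 0; src_rank < num_ranks; ++src_rank) {
                const int64_t index = local_e * num_ranks + src_rank;
                const int64_t pair_idx = send_per_group * (src_rank * num_local_experts + local_e);
                total += recv_data[pair_idx];                            // tokens from src_rank for this expert
                recv_count[index] = total;                               // running cumulative count
                recv_offset[index] = recv_data[pair_idx + 1];            // offset in the sender's window
                max_send = std::max(max_send, recv_data[pair_idx + 2]);  // largest sender batch size
                expert_total += recv_data[pair_idx];
            }
            recv_tokens_per_expert[local_e] = expert_total;
        }
        *total_recv_tokens = total;  // trt in the new code; gBs = max_bs * num_ranks
        *max_bs = max_send;
    }

In the new flow only the small recv_tokens_per_expert_ tensor is copied back to the host (to build the List[int] return value); the rest stays on the NPU and feeds aclnnCamMoeDispatchNormal directly.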

csrc/deepep/deep_ep.hpp

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ struct Buffer {
     at::Tensor new_topk_idx;
     at::Tensor new_scales;
     at::Tensor notify_send_data;  // only for internode notify
-    int notify_send_data_size;    // only for internode notify
+    at::Tensor send_token_idx_small;
+    int notify_send_data_size;  // only for internode notify

     int64_t shared_expert_rank_num;
     int64_t shared_expert_num = 1;
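The new member makes the layout result reusable: get_dispatch_layout caches the NPU tensor on the Buffer, and intranode_dispatch later reads it back instead of recomputing the indices on the host. Schematically (names from the diff, other members elided):

    struct Buffer {
        at::Tensor send_token_idx_small;  // {num_tokens, num_topk}, int32, written by aclnnDispatchLayout
        // ...
    };

    // In get_dispatch_layout:
    //     this->send_token_idx_small = send_token_idx_small;       // cache the NPU output
    // In intranode_dispatch:
    //     auto send_token_idx_small = this->send_token_idx_small;  // reuse, no host round-trip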

csrc/deepep/ops/op_host/dispatch_layout.cpp

Lines changed: 5 additions & 0 deletions
@@ -38,6 +38,11 @@ class DispatchLayout : public OpDef
             .DataType({ge::DT_INT32})
             .Format({ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Output("sendTokenIdxSmall")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_INT32})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});

         OpAICoreConfig a3_config;
         a3_config.DynamicCompileStaticFlag(true)

csrc/deepep/ops/op_host/notify_dispatch.cpp

Lines changed: 25 additions & 1 deletion
@@ -26,7 +26,31 @@ class NotifyDispatch : public OpDef
             .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
             .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
-
+        this->Output("totalRecvTokens")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("recvCount")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("recvOffset")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("maxBs")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("recvTokensPerExpert")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_INT64, ge::DT_INT64, ge::DT_INT64})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
         this->Attr("sendCount").Int();
         this->Attr("num_tokens").Int();
         this->Attr("comm_group").String();

csrc/deepep/ops/op_host/op_api/aclnn_dispatch_layout.cpp

Lines changed: 3 additions & 2 deletions
@@ -18,11 +18,12 @@ aclnnStatus aclnnDispatchLayoutGetWorkspaceSize(const aclTensor *topkIdx, int64_
                                                 int64_t numExperts, int64_t numTopk, int64_t localRankSize,
                                                 const aclTensor *numTokensPerRank, const aclTensor *numTokensPerExpert,
                                                 const aclTensor *isTokenInRank, const aclTensor *notifySendData,
-                                                uint64_t *workspaceSize, aclOpExecutor **executor)
+                                                const aclTensor *sendTokenIdxSmall, uint64_t *workspaceSize,
+                                                aclOpExecutor **executor)
 {
     return aclnnInnerDispatchLayoutGetWorkspaceSize(topkIdx, numTokens, numRanks, numExperts, numTopk, localRankSize,
                                                     numTokensPerRank, numTokensPerExpert, isTokenInRank, notifySendData,
-                                                    workspaceSize, executor);
+                                                    sendTokenIdxSmall, workspaceSize, executor);
 }

 aclnnStatus aclnnDispatchLayout(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream)

csrc/deepep/ops/op_host/op_api/aclnn_dispatch_layout.h

Lines changed: 2 additions & 1 deletion
@@ -24,7 +24,8 @@ extern "C" {
 __attribute__((visibility("default"))) aclnnStatus aclnnDispatchLayoutGetWorkspaceSize(
     const aclTensor *topkIdx, int64_t numTokens, int64_t numRanks, int64_t numExperts, int64_t numTopk,
     int64_t localRankSize, const aclTensor *numTokensPerRank, const aclTensor *numTokensPerExpert,
-    const aclTensor *isTokenInRank, const aclTensor *notifySendData, uint64_t *workspaceSize, aclOpExecutor **executor);
+    const aclTensor *isTokenInRank, const aclTensor *notifySendData, const aclTensor *sendTokenIdxSmall,
+    uint64_t *workspaceSize, aclOpExecutor **executor);

 /* function: aclnnDispatchLayout
  * workspace : workspace memory addr(input).

csrc/deepep/ops/op_host/op_api/aclnn_notify_dispatch.cpp

Lines changed: 6 additions & 2 deletions
@@ -20,11 +20,15 @@ aclnnStatus aclnnNotifyDispatchGetWorkspaceSize(const aclTensor *sendData, const
                                                 int64_t sendCount, int64_t numTokens, char *commGroup, int64_t rankSize,
                                                 int64_t rankId, int64_t localRankSize, int64_t localRankId,
                                                 const aclTensor *sendDataOffset, const aclTensor *recvData,
-                                                uint64_t *workspaceSize, aclOpExecutor **executor)
+                                                const aclTensor *totalRecvTokens, const aclTensor *recvCount,
+                                                const aclTensor *recvOffset, const aclTensor *maxBs,
+                                                const aclTensor *recvTokensPerExpert, uint64_t *workspaceSize,
+                                                aclOpExecutor **executor)
 {
     return aclnnInnerNotifyDispatchGetWorkspaceSize(sendData, tokenPerExpertData, sendCount, numTokens, commGroup,
                                                     rankSize, rankId, localRankSize, localRankId, sendDataOffset,
-                                                    recvData, workspaceSize, executor);
+                                                    recvData, totalRecvTokens, recvCount, recvOffset, maxBs,
+                                                    recvTokensPerExpert, workspaceSize, executor);
 }

 aclnnStatus aclnnNotifyDispatch(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream)

csrc/deepep/ops/op_host/op_api/aclnn_notify_dispatch.h

Lines changed: 3 additions & 1 deletion
@@ -27,7 +27,9 @@ extern "C" {
 __attribute__((visibility("default"))) aclnnStatus aclnnNotifyDispatchGetWorkspaceSize(
     const aclTensor *sendData, const aclTensor *tokenPerExpertData, int64_t sendCount, int64_t numTokens,
     char *commGroup, int64_t rankSize, int64_t rankId, int64_t localRankSize, int64_t localRankId,
-    const aclTensor *sendDataOffset, const aclTensor *recvData, uint64_t *workspaceSize, aclOpExecutor **executor);
+    const aclTensor *sendDataOffset, const aclTensor *recvData, const aclTensor *totalRecvTokens,
+    const aclTensor *recvCount, const aclTensor *recvOffset, const aclTensor *maxBs,
+    const aclTensor *recvTokensPerExpert, uint64_t *workspaceSize, aclOpExecutor **executor);

 /* function: aclnnNotifyDispatch
  * parameters :

csrc/deepep/ops/op_kernel/dispatch_layout.cpp

Lines changed: 6 additions & 5 deletions
@@ -8,7 +8,8 @@

 extern "C" __global__ __aicore__ void dispatch_layout(GM_ADDR topkIdx, GM_ADDR numTokensPerRank,
                                                       GM_ADDR numTokensPerExpert, GM_ADDR isTokenInRank,
-                                                      GM_ADDR notifySendData, GM_ADDR workspace, GM_ADDR tiling)
+                                                      GM_ADDR notifySendData, GM_ADDR sendTokenIdxSmall,
+                                                      GM_ADDR workspace, GM_ADDR tiling)
 {
     REGISTER_TILING_DEFAULT(DispatchLayoutTilingData);
     GET_TILING_DATA_WITH_STRUCT(DispatchLayoutTilingData, tilingData, tiling);
@@ -17,13 +18,13 @@ extern "C" __global__ __aicore__ void dispatch_layout(GM_ADDR topkIdx, GM_ADDR n

     if (TILING_KEY_IS(TILING_KEY_INT)) {
         MoeDispatchLayout::DispatchLayout<int32_t> op;
-        op.Init(topkIdx, numTokensPerRank, numTokensPerExpert, isTokenInRank, notifySendData, workspace, &pipe,
-                &tilingData);
+        op.Init(topkIdx, numTokensPerRank, numTokensPerExpert, isTokenInRank, notifySendData, sendTokenIdxSmall,
+                workspace, &pipe, &tilingData);
         op.Process();
     } else if (TILING_KEY_IS(TILING_KEY_A2_INT)) {
         MoeDispatchLayoutA2::DispatchLayoutA2<int32_t> op;
-        op.Init(topkIdx, numTokensPerRank, numTokensPerExpert, isTokenInRank, notifySendData, workspace, &pipe,
-                &tilingData);
+        op.Init(topkIdx, numTokensPerRank, numTokensPerExpert, isTokenInRank, notifySendData, sendTokenIdxSmall,
+                workspace, &pipe, &tilingData);
         op.Process();
     }
 }
