sgl-project
diff --git a/‎csrc/deepep/deep_ep.cpp‎
Lines changed: 13 additions & 12 deletions b/‎csrc/deepep/deep_ep.cpp‎
Lines changed: 13 additions & 12 deletions
diff --git a/‎csrc/deepep/deep_ep.hpp‎
Lines changed: 2 additions & 1 deletion b/‎csrc/deepep/deep_ep.hpp‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎csrc/deepep/ops/CMakePresets.json‎
Lines changed: 1 addition & 1 deletion b/‎csrc/deepep/ops/CMakePresets.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎csrc/deepep/ops/op_host/dispatch_layout.cpp‎
Lines changed: 21 additions & 3 deletions b/‎csrc/deepep/ops/op_host/dispatch_layout.cpp‎
Lines changed: 21 additions & 3 deletions
diff --git a/‎csrc/deepep/ops/op_host/dispatch_layout_a2.cpp‎
Lines changed: 0 additions & 78 deletions b/‎csrc/deepep/ops/op_host/dispatch_layout_a2.cpp‎
Lines changed: 0 additions & 78 deletions
diff --git a/‎csrc/deepep/ops/op_host/dispatch_layout_tiling.cc‎
Lines changed: 44 additions & 5 deletions b/‎csrc/deepep/ops/op_host/dispatch_layout_tiling.cc‎
Lines changed: 44 additions & 5 deletions
@@ -12,6 +12,10 @@ constexpr int PADDING_SIZE = 3;
 constexpr size_t HCOMM_NAME_LEN = 128;
 constexpr uint32_t NO_SCALES = 0;
 constexpr uint32_t DYNAMIC_SCALES = 2;
+// A2 sizing constants. TODO(review): presumably meant to live in a shared header — confirm.
+constexpr int A2_LOCAL_RANK_SIZE = 8;  // used as local_ranksize; server_num = num_ranks / local_ranksize
+constexpr int A2_MAX_BATCH_SIZE = 4096;
+constexpr int A2_EXPERT_DATA_SIZE = 1 + 2 * A2_MAX_BATCH_SIZE;  // 8193; per-expert term of total_data's size
 
 Buffer::Buffer(int64_t rank, int64_t num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode,
                std::string moe_all_to_all_group_name)
@@ -46,7 +50,7 @@ bool Buffer::is_available() const
     return available;
 }
 
-std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
 Buffer::get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std::optional<EventHandle> &previous_event,
                             bool async, bool allocate_on_comm_stream)
 {
@@ -73,30 +77,27 @@ Buffer::get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std:
 
     const int num_tokens = new_topk_idx.size(0);
     const int num_topk = new_topk_idx.size(1);
-    const int local_ranksize = 8;
+    const int local_ranksize = A2_LOCAL_RANK_SIZE;
     auto server_num = num_ranks / local_ranksize;
 
     auto device = new_topk_idx.device();
     auto num_tokens_per_expert = at::zeros({num_experts}, at::dtype(at::kInt).device(device));
     auto num_tokens_per_rank = at::zeros({num_ranks}, at::dtype(at::kInt).device(device));
     auto is_token_in_rank = at::zeros({num_tokens, num_ranks}, at::dtype(at::kInt).device(device));
-    auto local_token_server_offset = at::zeros({num_tokens * server_num}, at::dtype(at::kInt).device(device));
-    auto local_token_server_uniq_count = at::zeros({server_num}, at::dtype(at::kInt).device(device));
-    auto local_token_server_total_count = at::zeros({num_tokens * server_num}, at::dtype(at::kInt).device(device));
-    auto local_token_server_num = at::zeros({num_tokens}, at::dtype(at::kInt).device(device));
-    const int total_size = num_experts * 8193 + server_num + num_tokens * (1 + 2 * server_num + num_topk);
-    auto expert_rank_token_idx = at::zeros({total_size}, at::dtype(at::kInt).device(device));
+    const int total_size =
+        num_experts * A2_EXPERT_DATA_SIZE + server_num + num_tokens * (1 + 2 * server_num + num_topk);
+    auto total_data = at::zeros({total_size}, at::dtype(at::kInt).device(device));
 
-    EXEC_NPU_CMD(aclnnDispatchLayoutA2, new_topk_idx, num_tokens, num_ranks, num_experts, num_topk, local_ranksize, num_tokens_per_rank,
-                 num_tokens_per_expert, is_token_in_rank, local_token_server_offset, local_token_server_uniq_count,
-                 local_token_server_total_count, local_token_server_num, expert_rank_token_idx);
+    EXEC_NPU_CMD(aclnnDispatchLayout, new_topk_idx, num_tokens, num_ranks, num_experts, num_topk, local_ranksize,
+                 num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank, total_data);
 
+    this->send_data = total_data;
     std::optional<torch::Tensor> num_tokens_per_rdma_rank = std::nullopt;
     std::optional<EventHandle> output_event = std::nullopt;
     auto is_token_in_rank_bool = is_token_in_rank.to(at::kBool);
 
     return std::make_tuple(num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank_bool,
-                           expert_rank_token_idx, output_event);
+                           output_event);
 }
 
 std::tuple<at::Tensor, std::optional<at::Tensor>, std::optional<at::Tensor>, std::optional<at::Tensor>,
 
@@ -26,6 +26,7 @@ struct Buffer {
     at::Tensor ori_x;
     at::Tensor new_topk_idx;
     at::Tensor new_scales;
+    at::Tensor send_data;
 
     int64_t shared_expert_rank_num;
     int64_t shared_expert_num = 1;
@@ -47,7 +48,7 @@ struct Buffer {
 
     bool is_available() const;
 
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
+    std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
     get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std::optional<EventHandle> &previous_event,
                         bool async, bool allocate_on_comm_stream);
 
 
@@ -27,7 +27,7 @@
                 },
                 "ASCEND_COMPUTE_UNIT": {
                     "type": "STRING",
-                    "value": "ascend910b"
+                    "value": "ascend910_93"
                 },
                 "ENABLE_TEST": {
                     "type": "BOOL",
 
@@ -16,6 +16,7 @@ class DispatchLayout : public OpDef
         this->Attr("num_ranks").Int();
         this->Attr("num_experts").Int();
         this->Attr("num_topk").Int();
+        this->Attr("local_ranksize").Int();
 
         this->Output("numTokensPerRank")
             .ParamType(REQUIRED)
@@ -32,9 +33,14 @@ class DispatchLayout : public OpDef
             .DataType({ge::DT_INT32})
             .Format({ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Output("totalData")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_INT32})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
 
-        OpAICoreConfig aicore_config;
-        aicore_config.DynamicCompileStaticFlag(true)
+        OpAICoreConfig a3_config;
+        a3_config.DynamicCompileStaticFlag(true)
             .DynamicFormatFlag(true)
             .DynamicRankSupportFlag(true)
             .DynamicShapeSupportFlag(true)
@@ -44,7 +50,19 @@ class DispatchLayout : public OpDef
             .ExtendCfgInfo("jitCompile.flag", "static_true")
             .ExtendCfgInfo("multiKernelSupportDynamicGraph.value", "multi_kernel");
 
-        this->AICore().AddConfig("ascend910_93", aicore_config);
+        OpAICoreConfig a2_config;
+        a2_config.DynamicCompileStaticFlag(true)
+            .DynamicFormatFlag(true)
+            .DynamicRankSupportFlag(true)
+            .DynamicShapeSupportFlag(true)
+            .NeedCheckSupportFlag(false)
+            .PrecisionReduceFlag(true)
+            .ExtendCfgInfo("aclnnSupport.value", "support_aclnn")
+            .ExtendCfgInfo("jitCompile.flag", "static_false")
+            .ExtendCfgInfo("multiKernelSupportDynamicGraph.value", "multi_kernel");
+
+        this->AICore().AddConfig("ascend910_93", a3_config);
+        this->AICore().AddConfig("ascend910b", a2_config);
     }
 };
 
 
@@ -26,17 +26,23 @@ constexpr uint32_t INPUT_TOPK_IDX_INDEX = 0;
 constexpr uint32_t OUTPUT_NUM_TOKEN_PER_RANK_INDEX = 0;
 constexpr uint32_t OUTPUT_NUM_TOKEN_PER_EXPERT_INDEX = 1;
 constexpr uint32_t OUTPUT_IS_TOKEN_IN_RANK_INDEX = 2;
+constexpr uint32_t OUTPUT_TOTAL_DATA_INDEX = 3;
 
 constexpr uint32_t ATTR_NUM_TOKENS_INDEX = 0;
 constexpr uint32_t ATTR_NUM_RANKS_INDEX = 1;
 constexpr uint32_t ATTR_NUM_EXPERTS_INDEX = 2;
 constexpr uint32_t ATTR_NUM_TOPK_INDEX = 3;
+constexpr uint32_t ATTR_LOCAL_RANKSIZE_INDEX = 4;
 const int64_t MAX_COMM_WORLD_SIZE = 384;
 const int64_t MAX_MOE_EXPERTS_NUM = 512;
+const int64_t MAX_A2_LOCAL_RANKSIZE = 8;
 constexpr uint32_t SYSTEM_NEED_WORKSPACE = 16 * 1024 * 1024;
 constexpr uint32_t KERNEL_USE_WORKSPACE = 1 * 1024 * 1024;
 constexpr uint32_t KERNEL_A2_ARG_SIZE = 1 * 1024 * 1024;
 
+constexpr static int TILING_KEY_INT = 23;       // base tiling key passed to SetTilingKey
+constexpr static int TILING_KEY_A2_TYPE = 100;  // added to the base key when CheckIfA2Machine() is true
+
 constexpr uint32_t TWO_DIMS = 2;
 constexpr uint32_t K_MAX = 16;
 }  // namespace
@@ -48,9 +54,24 @@ static void PrintTilingDataInfo(const char *nodeName, DispatchLayoutTilingData &
     OP_LOGD(nodeName, "numRanks is %u.", tilingData.dispatchLayoutInfo.numRanks);
     OP_LOGD(nodeName, "numExperts is %u.", tilingData.dispatchLayoutInfo.numExperts);
     OP_LOGD(nodeName, "numTopk is %u.", tilingData.dispatchLayoutInfo.numTopk);
+    OP_LOGD(nodeName, "localRankSize is %u.", tilingData.dispatchLayoutInfo.localRankSize);
     OP_LOGD(nodeName, "totalUbSize is %lu.", tilingData.dispatchLayoutInfo.totalUbSize);
 }
 
+// Reports whether the platform's "Short_SoC_version" is "Ascend910B" (A2); false if platform info is unavailable.
+static bool CheckIfA2Machine(gert::TilingContext *context)
+{
+    fe::PlatFormInfos *platformInfoPtr = (context != nullptr) ? context->GetPlatformInfo() : nullptr;
+    if (platformInfoPtr == nullptr) {
+        return false;  // SoC cannot be identified without platform info; avoid dereferencing null
+    }
+
+    // Lookup status is deliberately ignored: presumably a failed lookup leaves socVersion empty (unequal below).
+    std::string socVersion;
+    (void)platformInfoPtr->GetPlatformResWithLock("version", "Short_SoC_version", socVersion);
+    return socVersion == "Ascend910B";
+}
+
 static ge::graphStatus GetAttrAndSetTilingData(gert::TilingContext *context, const char *nodeName,
                                                DispatchLayoutTilingData &tilingData)
 {
@@ -61,11 +82,14 @@ static ge::graphStatus GetAttrAndSetTilingData(gert::TilingContext *context, con
     auto numRanksPtr = attrs->GetAttrPointer<int64_t>(static_cast<int>(ATTR_NUM_RANKS_INDEX));
     auto numExpertsPtr = attrs->GetAttrPointer<int64_t>(ATTR_NUM_EXPERTS_INDEX);
     auto numTopkPtr = attrs->GetAttrPointer<int64_t>(static_cast<int>(ATTR_NUM_TOPK_INDEX));
+    auto localRankSizePtr = attrs->GetAttrPointer<int64_t>(static_cast<int>(ATTR_LOCAL_RANKSIZE_INDEX));
 
     OP_TILING_CHECK(numTokensPtr == nullptr, OP_LOGE(nodeName, "numTokensPtr is null."), return ge::GRAPH_FAILED);
     OP_TILING_CHECK(numRanksPtr == nullptr, OP_LOGE(nodeName, "numRanksPtr is null."), return ge::GRAPH_FAILED);
     OP_TILING_CHECK(numExpertsPtr == nullptr, OP_LOGE(nodeName, "numExpertsPtr is null."), return ge::GRAPH_FAILED);
     OP_TILING_CHECK(numTopkPtr == nullptr, OP_LOGE(nodeName, "numTopkPtr is null."), return ge::GRAPH_FAILED);
+    OP_TILING_CHECK(localRankSizePtr == nullptr, OP_LOGE(nodeName, "localRankSizePtr is null."),
+                    return ge::GRAPH_FAILED);
 
     OP_TILING_CHECK((*numRanksPtr <= 0) || (*numRanksPtr > MAX_COMM_WORLD_SIZE),
                     OP_LOGE(nodeName, "rankSize is invalid, only support (0, %ld], but got rankSize=%ld.",
@@ -80,10 +104,19 @@ static ge::graphStatus GetAttrAndSetTilingData(gert::TilingContext *context, con
         OP_LOGE(nodeName, "numTopkPtr is invalid, only support (0, %u], but got numTopk=%ld.", K_MAX, *numTopkPtr),
         return ge::GRAPH_FAILED);
 
+    if (CheckIfA2Machine(context)) {
+        OP_TILING_CHECK(
+            (*localRankSizePtr <= 0) || (*localRankSizePtr > MAX_A2_LOCAL_RANKSIZE),
+            OP_LOGE(nodeName, "localRankSizePtr is invalid, only support (0, %ld], but got localRankSize=%ld.",
+                    MAX_A2_LOCAL_RANKSIZE, *localRankSizePtr),
+            return ge::GRAPH_FAILED);
+    }
+
     tilingData.dispatchLayoutInfo.numTokens = static_cast<uint32_t>(*numTokensPtr);
     tilingData.dispatchLayoutInfo.numRanks = static_cast<uint32_t>(*numRanksPtr);
     tilingData.dispatchLayoutInfo.numExperts = static_cast<uint32_t>(*numExpertsPtr);
     tilingData.dispatchLayoutInfo.numTopk = static_cast<uint32_t>(*numTopkPtr);
+    tilingData.dispatchLayoutInfo.localRankSize = static_cast<uint32_t>(*localRankSizePtr);
 
     return ge::GRAPH_SUCCESS;
 }
@@ -102,11 +135,13 @@ static bool CheckTensorDataType(gert::TilingContext *context, const char *nodeNa
     auto numTokensPerRank = context->GetOutputDesc(OUTPUT_NUM_TOKEN_PER_RANK_INDEX);
     auto numTokensPerExpert = context->GetOutputDesc(OUTPUT_NUM_TOKEN_PER_EXPERT_INDEX);
     auto isTokenInRank = context->GetOutputDesc(OUTPUT_IS_TOKEN_IN_RANK_INDEX);
+    auto totalData = context->GetOutputDesc(OUTPUT_TOTAL_DATA_INDEX);
 
     OP_TILING_CHECK(topkIdx == nullptr, OP_LOGE(nodeName, "topkIdx is null."), return false);
     OP_TILING_CHECK(numTokensPerRank == nullptr, OP_LOGE(nodeName, "numTokensPerRank is null."), return false);
     OP_TILING_CHECK(numTokensPerExpert == nullptr, OP_LOGE(nodeName, "numTokensPerExpert is null."), return false);
     OP_TILING_CHECK(isTokenInRank == nullptr, OP_LOGE(nodeName, "isTokenInRank is null."), return false);
+    OP_TILING_CHECK(totalData == nullptr, OP_LOGE(nodeName, "totalData is null."), return false);
 
     OP_TILING_CHECK((topkIdx->GetDataType() != ge::DT_INT64),
                     OP_LOGE(nodeName, "topkIdx datatype is invalid, datatype should be int, but is %d.",
@@ -124,6 +159,10 @@ static bool CheckTensorDataType(gert::TilingContext *context, const char *nodeNa
                     OP_LOGE(nodeName, "isTokenInRank datatype is invalid, datatype should be int, but is %d.",
                             static_cast<ge::DataType>(isTokenInRank->GetDataType())),
                     return false);
+    OP_TILING_CHECK((totalData->GetDataType() != ge::DT_INT32),
+                    OP_LOGE(nodeName, "totalData datatype is invalid, datatype should be int, but is %d.",
+                            static_cast<ge::DataType>(totalData->GetDataType())),
+                    return false);
 
     return true;
 }
@@ -169,11 +208,11 @@ static ge::graphStatus DispatchLayoutTilingFuncImpl(gert::TilingContext *context
     OP_TILING_CHECK(SetWorkSpace(context, nodeName) != ge::GRAPH_SUCCESS,
                     OP_LOGE(nodeName, "Tiling set workspace failed."), return ge::GRAPH_FAILED);
 
-    fe::PlatFormInfos *platformInfoPtr = context->GetPlatformInfo();
-    fe::PlatFormInfos &platformInfo = *platformInfoPtr;
-
-    std::string socVersion;
-    (void)platformInfo.GetPlatformResWithLock("version", "Short_SoC_version", socVersion);
+    int tilingKey = TILING_KEY_INT;
+    if (CheckIfA2Machine(context)) {
+        tilingKey = tilingKey + TILING_KEY_A2_TYPE;
+    }
+    context->SetTilingKey(tilingKey);
 
     auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
     uint32_t blockDim;