Skip to content

Commit 8c683a5

Browse files
authored
Merge branch 'main' into a2_layour
2 parents 981bb22 + dbb3cc2 commit 8c683a5

File tree

35 files changed

+2362
-143
lines changed

35 files changed

+2362
-143
lines changed

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ else ()
2727
add_compile_options(-g -rdynamic)
2828
endif ()
2929

30+
if(DEFINED ENV{DEBUG_MODE})
31+
if("$ENV{DEBUG_MODE}" STREQUAL "ON")
32+
add_compile_definitions(DEBUG_MODE)
33+
message(STATUS "Debug logging enabled from environment")
34+
endif()
35+
endif()
36+
3037
set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR}/csrc)
3138
set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build)
3239
set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output)

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,34 @@
11
# sgl-kernel-npu
2+
23
SGLang kernel library for NPU
34
For contribution guidelines, refer to the [Contribution Guide](docs/developer_guide/contribution_guide.md).
5+
6+
## Quick start
7+
8+
DeepEP-Ascend: Ascend Implementation of DeepEP. [README](https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README.md)
9+
10+
SGL-Kernel-NPU: Other SGLang Kernels for Ascend NPU. [README](https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/sgl_kernel_npu/README.md)
11+
12+
## DeepEP-Ascend Performance
13+
14+
### Normal kernels with pure HCCS
15+
16+
We test normal kernels on the A3 384 SuperPOD, following the DeepSeek-V3/R1 pretraining setting (4096 tokens per batch, 7168 hidden, top-8 experts, INT8 dispatching and BF16 combining).
17+
18+
| Type | Dispatch #EP | Bottleneck bandwidth | Combine #EP | Bottleneck bandwidth |
19+
| --------- | ------------ | -------------------- | ----------- | -------------------- |
20+
| Intranode | 8 | 146 GB/s (HCCS) | 8 | 125 GB/s (HCCS) |
21+
| Intranode | 16 | 107 GB/s (HCCS) | 16 | 103 GB/s (HCCS) |
22+
| Intranode | 32 | 102 GB/s (HCCS) | 32 | 95 GB/s (HCCS) |
23+
| Intranode | 64 | 81 GB/s (HCCS) | 64 | 91 GB/s (HCCS) |
24+
| Intranode | 128 | 57 GB/s (HCCS) | 128 | 81 GB/s (HCCS) |
25+
26+
### Low-latency kernels with pure HCCS
27+
28+
We test low-latency kernels on the A3 384 SuperPOD, following a typical DeepSeek-V3/R1 production setting (128 tokens per batch, 7168 hidden, top-8 experts, INT8 dispatching and BF16 combining).
29+
30+
| Dispatch #EP | Latency | Bandwidth | Combine #EP | Latency | Bandwidth |
31+
| ------------ | ------- | -------------- | ----------- | ------- | --------------- |
32+
| 8 | 132 us | 58 GB/s (HCCS) | 8 | 126 us | 116 GB/s (HCCS) |
33+
| 16 | 139 us | 55 GB/s (HCCS) | 16 | 135 us | 109 GB/s (HCCS) |
34+
| 32 | 153 us | 49 GB/s (HCCS) | 32 | 151 us | 97 GB/s (HCCS) |

build.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ ONLY_BUILD_DEEPEP_ADAPTER_MODULE="OFF"
99
ONLY_BUILD_DEEPEP_KERNELs_MODULE="OFF"
1010
ONLY_BUILD_MEMORY_SAVER_MODULE="OFF"
1111

12-
while getopts ":a:h" opt; do
12+
DEBUG_MODE="OFF"
13+
14+
while getopts ":a:hd" opt; do
1315
case ${opt} in
1416
a )
1517
BUILD_DEEPEP_MODULE="OFF"
@@ -41,6 +43,9 @@ while getopts ":a:h" opt; do
4143
;;
4244
esac
4345
;;
46+
d )
47+
DEBUG_MODE="ON"
48+
;;
4449
h )
4550
echo "Use './build.sh' build all modules."
4651
echo "Use './build.sh -a <target>' to build specific parts of the project."
@@ -67,6 +72,9 @@ done
6772

6873
shift $((OPTIND -1))
6974

75+
76+
export DEBUG_MODE=$DEBUG_MODE
77+
7078
SOC_VERSION="${1:-Ascend910_9382}"
7179

7280
if [ -n "$ASCEND_HOME_PATH" ]; then

contrib/torch_memory_saver/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ bash build.sh -a memory-saver
8585
2. Pip install the `.whl` file into your Python environment
8686

8787
```bash
88-
pip install output/deep_ep*.whl
88+
pip install output/torch_memory_saver*.whl
8989
```
9090
## Test
9191
You can use this command for local testing:

contrib/torch_memory_saver/python/torch_memory_saver/hooks/mode_preload.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
from contextlib import contextmanager
44

5+
import torch
56
from torch_memory_saver.hooks.base import HookUtilBase
67
from torch_memory_saver.utils import get_binary_path_from_package
78

@@ -23,11 +24,17 @@ def get_path_binary(self):
2324
@contextmanager
2425
def configure_subprocess():
2526
"""Configure environment variables for subprocesses. Only needed for hook_mode=preload."""
26-
with _change_env(
27-
"LD_PRELOAD",
28-
str(get_binary_path_from_package("torch_memory_saver_hook_mode_preload")),
29-
):
27+
# Currently, torch_memory_saver does not support preload for npu, therefore LD_PRELOAD interception is not implemented.
28+
if hasattr(torch, "npu") and torch.npu.is_available():
3029
yield
30+
return
31+
32+
else:
33+
with _change_env(
34+
"LD_PRELOAD",
35+
str(get_binary_path_from_package("torch_memory_saver_hook_mode_preload")),
36+
):
37+
yield
3138

3239

3340
@contextmanager

csrc/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ FILE(GLOB OP_SRCS
99
${PROJECT_OP_SRC_BASE}/alloc_extend/op_host/alloc_extend_tiling.cpp
1010
${PROJECT_OP_SRC_BASE}/assign_cache_op/op_host/assign_cache.cpp
1111
${PROJECT_OP_SRC_BASE}/mla_preprocess/op_host/mla_preprocess.cpp
12+
${PROJECT_OP_SRC_BASE}/batch_matmul_transpose/op_host/batch_matmul_transpose.cpp
13+
${PROJECT_OP_SRC_BASE}/batch_matmul_transpose/op_host/tiling/tiling_data.cpp
1214
)
1315

1416
# set the so name
@@ -19,6 +21,7 @@ ascendc_library(no_workspace_kernel STATIC
1921
${PROJECT_OP_SRC_BASE}/helloworld/op_kernel/kernel_helloworld.cpp
2022
${PROJECT_OP_SRC_BASE}/cache_location_assign/op_kernel/cache_loc_assign_kernel.cpp
2123
${PROJECT_OP_SRC_BASE}/assign_cache_op/op_kernel/assign_cache_op.cpp
24+
${PROJECT_OP_SRC_BASE}/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
2225
)
2326

2427
ascendc_library(workspace_kernel STATIC
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#include <iostream>
2+
#include <string>
3+
#include "acl/acl.h"
4+
#include "kernel_tiling/kernel_tiling.h"
5+
#include "tiling/platform/platform_ascendc.h"
6+
#include "tiling/tiling_data.h"
7+
#include "defines.h"
8+
#include "torch_helper.h"
9+
#include "common_tiling.h"
10+
#include "aclrtlaunch_batch_matmul_transpose.h"
11+
12+
namespace sglang {
13+
namespace npu_kernel {
14+
using namespace pp_matmul;
15+
16+
std::unordered_map<c10::string_view, uint16_t> quantModeMap = {
17+
{"per_channel_symm", 0},
18+
{"per_channel_asymm", 1},
19+
{"per_token_symm", 2},
20+
};
21+
22+
std::unordered_map<c10::string_view, uint16_t> formatModeMap = {
23+
{"ND", 0},
24+
{"NZ", 1},
25+
};
26+
27+
std::unordered_map<c10::ScalarType, TensorDType> atType2tensorDType = {
28+
{at::ScalarType::BFloat16, TensorDType::TENSOR_DTYPE_BF16},
29+
{at::ScalarType::Half, TensorDType::TENSOR_DTYPE_FLOAT16}};
30+
31+
// batch size -> memory index
32+
constexpr uint32_t MAX_CAPTURE_NUM = 1024;
33+
34+
template <typename MapType>
35+
inline int GetModeVal(const MapType &mode_map, c10::optional<c10::string_view> mode_opt, c10::string_view default_mode,
36+
const char *mode_name)
37+
{
38+
std::string modeStr(mode_name);
39+
c10::string_view mode_str = mode_opt.value_or(default_mode);
40+
auto it = mode_map.find(mode_str);
41+
// if input mode is unsupported, use default value
42+
TORCH_CHECK(it != mode_map.end(), modeStr, c10::str(": Unsupported mode value ", mode_str));
43+
return it->second;
44+
}
45+
46+
HOST_API void batch_matmul_transpose(const at::Tensor &tensor_a, const at::Tensor &tensor_b, at::Tensor &tensor_c,
47+
c10::optional<c10::string_view> format_mode,
48+
c10::optional<c10::string_view> quant_mode)
49+
{
50+
auto tensorAShape = tensor_a.sizes();
51+
auto tensorBShape = tensor_b.sizes();
52+
auto tensorCShape = tensor_c.sizes();
53+
uint32_t n;
54+
uint32_t block_dim;
55+
HardwareInfo hwInfo;
56+
std::map<c10::ScalarType, float> dTypeMap = {{at::ScalarType::Half, 2.0}, {at::ScalarType::BFloat16, 2.0}};
57+
58+
at::ScalarType aType = tensor_a.scalar_type();
59+
at::ScalarType bType = tensor_b.scalar_type();
60+
at::ScalarType cType = tensor_c.scalar_type();
61+
TORCH_CHECK(aType == bType && bType == cType, "tensor type is not the same");
62+
TORCH_CHECK((aType == at::ScalarType::BFloat16) || (aType == at::ScalarType::Half),
63+
"tensor type only support half or bf16");
64+
65+
TensorFormat formatMode = static_cast<TensorFormat>(GetModeVal(formatModeMap, format_mode, "ND", "format_mode"));
66+
MatMul::QuantMode quantMode =
67+
static_cast<MatMul::QuantMode>(GetModeVal(quantModeMap, quant_mode, "per_channel_symm", "quant_mode"));
68+
69+
TORCH_CHECK(tensorAShape.size() == 3, "batch size is not same between srcTensor and dstTensor");
70+
if (formatMode == TensorFormat::TENSOR_FORMAT_ND) {
71+
TORCH_CHECK(tensorBShape.size() == 3, "tensor shape should be dim3 in ND format");
72+
TORCH_CHECK(tensorAShape[2] == tensorBShape[1], "tensor shape is wrong");
73+
n = tensorBShape[2];
74+
} else {
75+
TORCH_CHECK(tensorBShape.size() == 4, "tensor shape should be dim4 in nz format");
76+
TORCH_CHECK(tensorAShape[2] == tensorBShape[2], "tensor shape is wrong");
77+
n = tensorBShape[1] * tensorBShape[3];
78+
}
79+
TORCH_CHECK(tensorAShape[1] == tensorBShape[0], "tensor shape is wrong");
80+
81+
OpShape opShape = {.batchSize = static_cast<uint32_t>(tensorAShape[1]),
82+
.m = static_cast<uint32_t>(tensorAShape[0]),
83+
.k = static_cast<uint32_t>(tensorAShape[2]),
84+
.n = n};
85+
PpMatmulTilingData matmulTilingData = {
86+
.opShape = opShape,
87+
};
88+
auto dType = atType2tensorDType[aType];
89+
MatMulInfo mmInfo = {.batchSize = opShape.batchSize,
90+
.m = opShape.m,
91+
.k = opShape.k,
92+
.n = opShape.n,
93+
.dtypeA = dType,
94+
.dtypeB = dType,
95+
.dtypeC = dType,
96+
.formatB = formatMode,
97+
.mmType = MatMul::MatMulType::MATMUL_EIN_SUM,
98+
.inDtype = dTypeMap[aType],
99+
.outDtype = dTypeMap[cType],
100+
.quantMode = quantMode};
101+
GetPpMatmulTiling(mmInfo, hwInfo, block_dim, matmulTilingData);
102+
host_utils::PpMatmulTilingCheck(matmulTilingData);
103+
104+
// tiling
105+
int32_t batchIdx = opShape.m - 1;
106+
uint32_t tilingSize = sizeof(PpMatmulTilingData);
107+
static auto global_tiling_data = at::empty(
108+
{tilingSize * MAX_CAPTURE_NUM}, at::TensorOptions().dtype(at::kByte).device(tensor_a.options().device()));
109+
if (batchIdx >= 0 && batchIdx < MAX_CAPTURE_NUM) {
110+
aclrtMemcpy(global_tiling_data.data_ptr<uint8_t>() + (tilingSize * batchIdx), tilingSize, &matmulTilingData,
111+
tilingSize, ACL_MEMCPY_HOST_TO_DEVICE);
112+
} else {
113+
// Handle the case where batchIdx is out of range
114+
TORCH_CHECK(false, "batchIdx is out of range: ", batchIdx);
115+
}
116+
at::Tensor tiling_tensor =
117+
at::from_blob(global_tiling_data.data_ptr<uint8_t>() + (tilingSize * batchIdx), tilingSize, at::kByte);
118+
119+
EXEC_KERNEL_CMD(batch_matmul_transpose, block_dim, tensor_a, tensor_b, tensor_c, tiling_tensor);
120+
}
121+
122+
} // namespace npu_kernel
123+
124+
} // namespace sglang

0 commit comments

Comments
 (0)