
Commit 8c2dbd7

[Feat] Lightning indexer op & GE helper engineering

1 parent: d56b60b
24 files changed: +4418 −22 lines

csrc/CMakeLists.txt

Lines changed: 7 additions & 0 deletions

```diff
@@ -1,5 +1,6 @@
 # set the library output dir to the python dir for wheel package build
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/python/sgl_kernel_npu/sgl_kernel_npu/lib)
+set(ASCEND_INCLUDE_DIR ${ASCEND_HOME_PATH}/aarch64-linux/include)
 
 # host side files
 FILE(GLOB OP_SRCS
@@ -17,6 +18,8 @@ FILE(GLOB OP_SRCS
     ${PROJECT_OP_SRC_BASE}/lora/op_host/bgmv_shrink.cpp
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgmv_expand.cpp
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgmv_shrink.cpp
+    ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/lightning_indexer.cpp
+    ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/tiling/lightning_indexer_tiling.cpp
 )
 
 # set the so name
@@ -38,6 +41,7 @@ ascendc_library(workspace_kernel STATIC
     ${PROJECT_OP_SRC_BASE}/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/alloc_extend/op_kernel/alloc_extend_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/build_tree/op_kernel/build_tree_kernel.cpp
+    ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_kernel/lightning_indexer_kernel.cpp
 )
 
 ascendc_compile_definitions(workspace_kernel PRIVATE
@@ -71,4 +75,7 @@ target_include_directories(${OP_PLUGIN_NAME} PRIVATE
     ${TORCH_DIR}/include
     ${TORCH_DIR}/include/torch/csrc/api/include
     ${TORCH_NPU_DIR}/include
+    ${ASCEND_INCLUDE_DIR}/external
+    ${ASCEND_INCLUDE_DIR}/experiment/platform
+    ${ASCEND_INCLUDE_DIR}/experiment/runtime
 )
```

csrc/alloc_extend/op_host/alloc_extend_tiling.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -34,7 +34,7 @@ at::Tensor get_tiling(int32_t &block_dim, int32_t &workspace_size, const int64_t
     tiling_data->used_core_num = block_dim;
     tiling_data->total_extend_tokens = total_extend_tokens;
 
-    auto tiling_tensor = TorchNpuHepler::CopyTensorHostToDevice(tiling_buffer);
+    auto tiling_tensor = TorchNpuHelper::CopyTensorHostToDevice(tiling_buffer);
     return tiling_tensor;
 }
```

csrc/assign_cache_op/op_host/assign_cache.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -23,7 +23,7 @@ HOST_API at::Tensor GetTilingTensor(CustomAssignTilingData &tilingData, size_t t
 {
     auto buffer = at::empty({static_cast<int64_t>(tilingSize)}, at::kByte);
     tilingData.SetToBuffer(buffer.data_ptr<uint8_t>(), tilingSize);
-    auto tilingTensor = TorchNpuHepler::CopyTensorHostToDevice(buffer);
+    auto tilingTensor = TorchNpuHelper::CopyTensorHostToDevice(buffer);
     return tilingTensor;
 }
 
@@ -57,7 +57,7 @@ HOST_API bool assign_cache_op(at::Tensor &dstTensor, const at::Tensor &srcTensor
     at::Tensor tiling = GetTilingTensor(tilingData, sizeof(tilingData));
 
     auto sync = at::zeros({syncWorkspaceSize, 1}, at::kByte);
-    auto syncDevice = TorchNpuHepler::CopyTensorHostToDevice(sync);
+    auto syncDevice = TorchNpuHelper::CopyTensorHostToDevice(sync);
     EXEC_KERNEL_CMD(assign_cache_op, blockDim, dstTensor, srcTensor, dstStartIdx, dstEndIdx, srcStartIdx, srcEndIdx,
                     syncDevice, tiling);
     return true;
```

csrc/build_tree/op_host/build_tree.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -42,7 +42,7 @@ at::Tensor get_tiling(int32_t &block_dim, int32_t &workspace_size, int32_t batch
     tiling_data->big_core_tile_num = (batch_size + block_dim - 1) / block_dim;
     tiling_data->small_core_tile_num = batch_size / block_dim;
 
-    auto tiling_tensor = TorchNpuHepler::CopyTensorHostToDevice(tiling_buffer);
+    auto tiling_tensor = TorchNpuHelper::CopyTensorHostToDevice(tiling_buffer);
     return tiling_tensor;
 }
```

csrc/cache_location_assign/op_host/cache_loc_assign.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -70,7 +70,7 @@ at::Tensor getTiling(const at::Tensor &reqPoolIndices, uint64_t rowSize, uint64_
         throw std::invalid_argument("Batch size is too large, buffer is not enough to do calculate");
     }
 
-    auto tilingTensor = TorchNpuHepler::CopyTensorHostToDevice(tilingBuffer);
+    auto tilingTensor = TorchNpuHelper::CopyTensorHostToDevice(tilingBuffer);
     return tilingTensor;
 }
```

csrc/lightning_indexer/README.md

Lines changed: 71 additions & 0 deletions
# torch.ops.npu.lightning_indexer

## Product Support Status

| Product | Supported |
| ------- | :-------: |
| Atlas A3 Inference Product Series | √ |

## Function Description

`LightningIndexer` computes the Top-$k$ context positions for each token. For a token's Index Query $Q_{index}\in\mathbb{R}^{g\times d}$, the context Index Key $K_{index}\in\mathbb{R}^{S_{k}\times d}$, and weights $W\in\mathbb{R}^{g\times 1}$, where $g$ is the group size for GQA, $d$ is the dimension of each head, and $S_{k}$ is the context length, `LightningIndexer` computes:

$$
\text{Top-}k\left\{[1]_{1\times g}@\left[(W@[1]_{1\times S_{k}})\odot\text{ReLU}\left(Q_{index}@K_{index}^T\right)\right]\right\}
$$
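
For intuition, here is a minimal PyTorch sketch of this formula for a single token. It mirrors the math only, not the NPU kernel; the function name and signature are illustrative:

```python
import torch

def lightning_indexer_reference(q_index: torch.Tensor, k_index: torch.Tensor,
                                w: torch.Tensor, k: int) -> torch.Tensor:
    """Top-k positions for one token.

    q_index: [g, d] index query, k_index: [S_k, d] context index keys,
    w: [g, 1] per-head weights, k: number of positions to keep.
    """
    scores = torch.relu(q_index @ k_index.T)   # ReLU(Q_index @ K_index^T) -> [g, S_k]
    weighted = w * scores                      # (W @ [1]_{1xS_k}) ⊙ ...  (w broadcast over S_k)
    per_pos = weighted.sum(dim=0)              # [1]_{1xg} @ ...  (sum over the g heads) -> [S_k]
    return torch.topk(per_pos, k).indices      # indices of the k highest-scoring positions
```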

## Function Prototype

```
torch.ops.npu.lightning_indexer(query, key, weights, actual_seq_lengths_query=None, actual_seq_lengths_key=None, block_table=None, layout_query='BSND', layout_key='BSND', sparse_count=2048, sparse_mode=3) -> Tensor
```

## Parameter Description

>**Note:**<br>
>
>- Dimension meanings for the query, key, and weights parameters: B (Batch Size) is the batch size of the input samples, S (Sequence Length) is the sequence length of the input samples, H (Head Size) is the size of the hidden layer, N (Head Num) is the number of attention heads, D (Head Dim) is the smallest unit dimension of the hidden layer, satisfying D = H/N, and T is the cumulative sum of sequence lengths over all batches.
>- S1 and N1 denote the S and N dimensions of the query shape; S2 and N2 denote the S and N dimensions of the key shape.

- **query** (`Tensor`): Required. Non-contiguous tensors are not supported. Data layout: ND. Supported data types: `bfloat16` and `float16`.

- **key** (`Tensor`): Required. Non-contiguous tensors are not supported. Data layout: ND. Supported data types: `bfloat16` and `float16`. When layout_key is 'PA_BSND', the shape is [block_count, block_size, N2, D], where block_count is the total number of blocks in PageAttention and block_size is the number of tokens in one block.

- **weights** (`Tensor`): Required. Non-contiguous tensors are not supported. Data layout: ND. Supported data types: `bfloat16` and `float16`. Supported input shapes: [B,S1,N1] and [T,N1].

- **\***: Parameters before it are positional and must be provided in order (required parameters); parameters after it are keyword arguments, position-independent and optional (default values are used when they are not provided).

- **actual_seq_lengths_query** (`Tensor`): Optional. The number of valid tokens of `query` in each batch. Supported data type: `int32`. If the sequence length is not specified, None can be passed, meaning it equals the S dimension of `query`'s shape.
  - The number of valid tokens in each batch must not exceed the S dimension of `query`. Supports a 1D tensor of length B. When `query`'s layout is 'TND', this parameter is required, and its element count is used as the B value. Each element is the cumulative token count of the current batch and all previous batches (a prefix sum), so a later element must be >= the element before it (for example, batches with 3 and 5 valid tokens give [3, 8]). Negative values are not allowed.

- **actual_seq_lengths_key** (`Tensor`): Optional. The number of valid tokens of `key` in each batch. Supported data type: `int32`. If the sequence length is not specified, None can be passed, meaning it equals the S dimension of `key`'s shape. Supports a 1D tensor of length B.

- **block_table** (`Tensor`): Optional. The block mapping table used for KV storage in PageAttention. Data layout: ND. Supported data type: `int32`.
  - In PageAttention scenarios, block_table must be 2D, with the first dimension equal to B and the second dimension not less than maxBlockNumPerSeq (the maximum number of blocks implied by actual_seq_lengths_key for any batch); see the sketch after this parameter list.

- **layout_query** (`str`): Optional. The data layout of `query`. Currently supported: 'BSND', 'TND'. Default: "BSND".

- **layout_key** (`str`): Optional. The data layout of `key`. Currently supported: 'PA_BSND', 'BSND', 'TND'. Default: "BSND". In non-PageAttention scenarios, this value should match **layout_query**.

- **sparse_count** (`int`): Optional. The number of blocks to retain in the top-k phase. Supported values: 1-2048. Data type: `int32`.

- **sparse_mode** (`int`): Optional. The sparse mode. Supported values: 0 and 3. Data type: `int32`.
  - When sparse_mode is 0, the defaultMask mode is used.
  - When sparse_mode is 3, the rightDownCausal mask is used, i.e. the lower-triangular region anchored at the bottom-right vertex.
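
To make block_table and maxBlockNumPerSeq concrete, here is a toy sketch referenced above. It is not an API of this op; it assumes a simple contiguous allocation policy and illustrative names:

```python
import torch

def build_block_table(seq_lens, block_size):
    """Allocate blocks contiguously per sequence and return the
    [B, maxBlockNumPerSeq] int32 mapping used in PageAttention scenarios."""
    need = [(s + block_size - 1) // block_size for s in seq_lens]  # blocks per batch
    max_blocks_per_seq = max(need)                                 # maxBlockNumPerSeq
    table = torch.zeros(len(seq_lens), max_blocks_per_seq, dtype=torch.int32)
    next_free = 0
    for row, n in enumerate(need):
        table[row, :n] = torch.arange(next_free, next_free + n, dtype=torch.int32)
        next_free += n
    return table

# actual_seq_lengths_key = [300, 128] with block_size = 128 needs 3 and 1 blocks:
# tensor([[0, 1, 2],
#         [3, 0, 0]], dtype=torch.int32)
```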

## Return Value Description

- **out** (`Tensor`): The output of the formula above. Supported data type: `int32`. Data layout: ND.

## Constraints

- This interface supports inference scenarios.
- This interface supports graph mode.
- When used with PyTorch, the versions of the CANN packages and the PyTorch packages must be compatible.
- The N dimension of query must be 64; the N dimension of key must be 1.
- The D dimensions of query and key must both be 128.
- The data types of query, key, and weights must be consistent.
- block_size must be a multiple of 16, with a maximum of 1024.

## Usage Example

- See details in [test_lightning_indexer.py](../../tests/python/sgl_kernel_npu/test_lightning_indexer.py)
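
For orientation only, a hedged call sketch consistent with the prototype and constraints above; the shapes are invented and the linked test remains the authoritative example:

```python
import torch
import torch_npu  # assumes an NPU environment with sgl_kernel_npu's ops registered

B, S1, S2, N1, N2, D = 1, 1, 4096, 64, 1, 128   # N1=64, N2=1, D=128 per the constraints
dtype = torch.bfloat16

query = torch.randn(B, S1, N1, D, dtype=dtype).npu()
key = torch.randn(B, S2, N2, D, dtype=dtype).npu()
weights = torch.randn(B, S1, N1, dtype=dtype).npu()
actual_seq_lengths_key = torch.tensor([S2], dtype=torch.int32).npu()

out = torch.ops.npu.lightning_indexer(
    query, key, weights,
    actual_seq_lengths_key=actual_seq_lengths_key,
    layout_query="BSND", layout_key="BSND",
    sparse_count=2048, sparse_mode=3)
print(out.shape)  # [B, S1, N2, sparse_count], dtype int32
```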
csrc/lightning_indexer/op_host/lightning_indexer.cpp

Lines changed: 171 additions & 0 deletions

```cpp
#include <any>
#include <cstdio>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include "acl/acl.h"
#include "kernel_tiling/kernel_tiling.h"
#include "tiling/platform/platform_ascendc.h"
#include "tiling/lightning_indexer_tiling.h"
#include "defines.h"
#include "torch_helper.h"
#include "ge_helper.h"
#include "common_tiling.h"
#include "lightning_indexer_def.h"
#include "common.h"
#include "aclrtlaunch_lightning_indexer.h"

namespace sglang::LIHost {

using namespace ge_helper;
constexpr uint32_t MAX_CAPTURE_NUM = 1024;
constexpr uint32_t MAX_DECODE_BS = 512;
// npu tensor max rank
constexpr int SIZE = 8;
constexpr int DIM_0 = 0;
constexpr int DIM_1 = 1;
constexpr int DIM_2 = 2;
constexpr int DIM_3 = 3;

// namespace-scope global state: maps the hash of a tiling configuration to its
// slot in the global tiling buffer
uint32_t actualCaptureNum = 0;
static std::unordered_map<uint64_t, uint32_t> captureMap;
// at::Tensor workspace;

inline at::Tensor ConstructLightningIndexerOutputTensor(const at::Tensor &query, const at::Tensor &key,
                                                        const c10::optional<at::Tensor> &actual_seq_lengths_query,
                                                        int64_t sparse_count, std::string query_layout_str,
                                                        std::string key_layout_str)
{
    at::SmallVector<int64_t, SIZE> outputSize;
    for (size_t i = 0; i < query.sizes().size(); i++) {
        TORCH_CHECK(query.size(i) > 0,
                    "All values within query's shape should be greater than 0, but shape[", i, "] is ", query.size(i));
    }
    TORCH_CHECK(sparse_count > 0, "sparse count should be greater than 0, but now is ", sparse_count);

    if (query_layout_str == "BSND") {
        outputSize = {query.size(DIM_0), query.size(DIM_1), key.size(DIM_2), sparse_count};
    } else {
        // TND query: the N dim of key sits at index 1 for TND and at index 2 otherwise
        int n_dim_index = (key_layout_str == "TND") ? DIM_1 : DIM_2;
        outputSize = {query.size(DIM_0), key.size(n_dim_index), sparse_count};
    }
    return at::empty(outputSize, query.options().dtype(at::kInt));
}
}  // namespace sglang::LIHost

namespace sglang {
namespace npu_kernel {
HOST_API at::Tensor lightning_indexer(const at::Tensor &query, const at::Tensor &key, const at::Tensor &weights,
                                      const c10::optional<at::Tensor> &actual_seq_lengths_query,
                                      const c10::optional<at::Tensor> &actual_seq_lengths_key,
                                      const c10::optional<at::Tensor> &block_table,
                                      c10::optional<c10::string_view> layout_query,
                                      c10::optional<c10::string_view> layout_key, c10::optional<int64_t> sparse_count,
                                      c10::optional<int64_t> sparse_mode)
{
    using namespace LIHost;
    LightningIndexer indexer("lightning_indexer");
    auto context = std::make_shared<TilingContext>("lightning_indexer");
    TORCH_CHECK(context != nullptr, "TilingContext is null");

    // Start from the op definition's default attributes, then apply caller overrides
    std::string layoutQuery(indexer.GetAttr(ATTR_QUERY_LAYOUT_INDEX).GetString());
    std::string layoutKey(indexer.GetAttr(ATTR_KEY_LAYOUT_INDEX).GetString());
    int64_t sparseCount = std::any_cast<int32_t>(indexer.GetAttr(ATTR_SPARSE_COUNT_INDEX).GetValue());

    if (layout_query.has_value()) {
        layoutQuery = std::string(layout_query.value());
        indexer.SetAttrStr("layout_query", layoutQuery);
    }
    if (layout_key.has_value()) {
        layoutKey = std::string(layout_key.value());
        indexer.SetAttrStr("layout_key", layoutKey);
    }
    if (sparse_count.has_value()) {
        sparseCount = sparse_count.value();
        indexer.SetAttrAny("sparse_count", static_cast<int32_t>(sparseCount));
    }
    if (sparse_mode.has_value()) {
        indexer.SetAttrAny("sparse_mode", static_cast<int32_t>(sparse_mode.value()));
    }

    at::Tensor sparse_indices = ConstructLightningIndexerOutputTensor(query, key, actual_seq_lengths_query,
                                                                      sparseCount, layoutQuery, layoutKey);

    auto qScalarType = query.scalar_type();

    // Missing optional inputs are replaced by 1-element placeholders so the kernel
    // launch always receives a valid tensor
    at::Tensor actualSeqLengthsQuery =
        actual_seq_lengths_query.has_value()
            ? actual_seq_lengths_query.value()
            : at::empty({1}, at::TensorOptions().dtype(qScalarType).device(query.options().device()));

    at::Tensor actualSeqLengthsKey =
        actual_seq_lengths_key.has_value()
            ? actual_seq_lengths_key.value()
            : at::empty({1}, at::TensorOptions().dtype(qScalarType).device(query.options().device()));

    at::Tensor blockTable =
        block_table.has_value()
            ? block_table.value()
            : at::empty({1}, at::TensorOptions().dtype(qScalarType).device(query.options().device()));

    indexer.SetToContext(context, qScalarType);
    context->RegisterTensor(query, true);
    context->RegisterTensor(key, true);
    context->RegisterTensor(weights, true);
    context->RegisterTensor(actual_seq_lengths_query, true);
    context->RegisterTensor(actual_seq_lengths_key, true);
    context->RegisterTensor(block_table, true);
    context->RegisterTensor(sparse_indices, false);

    LITilingInfo liInfo;
    LIInfoParser liInfoParser(context.get());
    TORCH_CHECK(liInfoParser.ParseAndCheck(liInfo) == ge::GRAPH_SUCCESS, "lightning_indexer ParseAndCheck failed");

    LightningIndexerTiling liTiling(context.get());
    liTiling.DoTiling(&liInfo);
    const auto &tilingData = liTiling.GetTilingData();

    uint32_t tilingSize = sizeof(LITilingData);
    auto blockDim = tilingData.usedCoreNum;
    auto bs = tilingData.bSize;
    at::Tensor tilingTensor;

    // Tiling data is cached by a hash of its shape-defining fields so that graph
    // capture/replay sees a stable device address for each distinct configuration
    auto tup =
        std::make_tuple(tilingData.bSize, tilingData.n2Size, tilingData.gSize, tilingData.s1Size, tilingData.s2Size,
                        tilingData.blockSize, tilingData.maxBlockNumPerBatch, tilingData.tilingKey);
    auto hashValue = host_utils::TupleHasher::Hash(tup);

    // One device buffer holds up to MAX_CAPTURE_NUM tiling blobs at fixed offsets
    static auto globalTilingBuffer = at::empty({tilingSize * MAX_CAPTURE_NUM},
                                               at::TensorOptions().dtype(at::kByte).device(query.options().device()));

    if (actualCaptureNum >= MAX_CAPTURE_NUM) {
        // Cache slots exhausted: fall back to a single static buffer rewritten per call
        static auto prefillTilingBuffer =
            at::empty({tilingSize}, at::TensorOptions().dtype(at::kByte).device(query.options().device()));
        aclrtMemcpy(prefillTilingBuffer.data_ptr<uint8_t>(), tilingSize, &tilingData, tilingSize,
                    ACL_MEMCPY_HOST_TO_DEVICE);
        tilingTensor = at::from_blob(prefillTilingBuffer.data_ptr<uint8_t>(), tilingSize, at::kByte);
    } else if (captureMap.find(hashValue) != captureMap.end()) {
        // Decode replay phase and part of cached prefill tiling data got from globalTilingBuffer
        tilingTensor = at::from_blob(globalTilingBuffer.data_ptr<uint8_t>() + (tilingSize * captureMap[hashValue]),
                                     tilingSize, at::kByte);
    } else {
        // Captured tiling cached here
        captureMap[hashValue] = actualCaptureNum;
        aclrtMemcpy(globalTilingBuffer.data_ptr<uint8_t>() + actualCaptureNum * tilingSize, tilingSize, &tilingData,
                    tilingSize, ACL_MEMCPY_HOST_TO_DEVICE);
        actualCaptureNum++;
        tilingTensor = at::from_blob(globalTilingBuffer.data_ptr<uint8_t>() + (tilingSize * captureMap[hashValue]),
                                     tilingSize, at::kByte);
    }

    size_t workspaceSize = context->GetWorkspaceSize();
    auto workspace = at::empty({static_cast<int64_t>(workspaceSize)},
                               at::TensorOptions().dtype(at::kByte).device(query.options().device()));
    EXEC_KERNEL_CMD(lightning_indexer, blockDim, query, key, weights, actualSeqLengthsQuery, actualSeqLengthsKey,
                    blockTable, sparse_indices, workspace, tilingTensor);
    return sparse_indices;
}
}  // namespace npu_kernel
}  // namespace sglang
```
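
The tiling-buffer branches above implement a small capture-friendly cache: each distinct tiling configuration is keyed by a hash of its shape-defining fields and placed at a fixed offset in one preallocated device buffer, so a replayed graph keeps a stable address. A minimal Python sketch of the same idea, with illustrative names and a plain CPU tensor standing in for device memory:

```python
import torch

MAX_CAPTURE_NUM = 1024  # mirrors the constant in the C++ above

class TilingCacheSketch:
    """Store each distinct tiling blob at a fixed offset keyed by its fields."""

    def __init__(self, tiling_size: int):
        self.tiling_size = tiling_size
        self.slots = {}  # hash of tiling fields -> slot index
        self.buffer = torch.empty(tiling_size * MAX_CAPTURE_NUM, dtype=torch.uint8)

    def get(self, fields: tuple, blob: torch.Tensor) -> torch.Tensor:
        key = hash(fields)
        if key not in self.slots:
            if len(self.slots) >= MAX_CAPTURE_NUM:
                # cache full: the real op falls back to one static scratch buffer
                return blob
            self.slots[key] = len(self.slots)
            start = self.slots[key] * self.tiling_size
            self.buffer[start:start + self.tiling_size] = blob  # host->device copy in the real op
        start = self.slots[key] * self.tiling_size
        return self.buffer[start:start + self.tiling_size]      # stable storage for replay
```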
