|
| 1 | +#include <cstdio> |
| 2 | +#include <string> |
| 3 | +#include "acl/acl.h" |
| 4 | +#include "kernel_tiling/kernel_tiling.h" |
| 5 | +#include "tiling/platform/platform_ascendc.h" |
| 6 | +#include "tiling/lightning_indexer_tiling.h" |
| 7 | +#include "defines.h" |
| 8 | +#include "torch_helper.h" |
| 9 | +#include "ge_helper.h" |
| 10 | +#include "common_tiling.h" |
| 11 | +#include "lightning_indexer_def.h" |
| 12 | +#include "common.h" |
| 13 | +#include "aclrtlaunch_lightning_indexer.h" |
| 14 | + |
| 15 | +namespace sglang::LIHost { |
| 16 | + |
| 17 | +using namespace ge_helper; |
| 18 | +constexpr uint32_t MAX_CAPTURE_NUM = 1024; |
| 19 | +constexpr uint32_t MAX_DECODE_BS = 512; |
| 20 | +// npu tensor max size |
| 21 | +constexpr int SIZE = 8; |
| 22 | +constexpr int DIM_0 = 0; |
| 23 | +constexpr int DIM_1 = 1; |
| 24 | +constexpr int DIM_2 = 2; |
| 25 | +constexpr int DIM_3 = 3; |
| 26 | + |
| 27 | +// namespace scope global parameters |
| 28 | +uint32_t actualCaptureNum = 0; |
| 29 | +static std::unordered_map<uint64_t, uint32_t> captureMap; |
| 30 | +// at::Tensor workspace; |
| 31 | + |
| 32 | +inline at::Tensor ConstructLightningIndexerOutputTensor(const at::Tensor &query, const at::Tensor &key, |
| 33 | + const c10::optional<at::Tensor> &actual_seq_lengths_query, |
| 34 | + int64_t sparse_count, std::string query_layout_str, |
| 35 | + std::string key_layout_str) |
| 36 | +{ |
| 37 | + at::SmallVector<int64_t, SIZE> outputSize; |
| 38 | + for (size_t i = 0; i < query.sizes().size(); i++) { |
| 39 | + TORCH_CHECK(query.size(i) > 0, |
| 40 | + "All values within query's shape should be greater " |
| 41 | + "than 0, but shape[", |
| 42 | + i, "] is ", query.size(i)); |
| 43 | + } |
| 44 | + TORCH_CHECK(sparse_count > 0, "sparse count should be greater than 0, but now is ", sparse_count); |
| 45 | + |
| 46 | + if (query_layout_str == "BSND") { |
| 47 | + outputSize = {query.size(DIM_0), query.size(DIM_1), key.size(DIM_2), sparse_count}; |
| 48 | + } else { |
| 49 | + int n_dim_index = 0; |
| 50 | + n_dim_index = (key_layout_str == "TND") ? DIM_1 : DIM_2; |
| 51 | + outputSize = {query.size(DIM_0), key.size(n_dim_index), sparse_count}; |
| 52 | + } |
| 53 | + at::Tensor output = at::empty(outputSize, query.options().dtype(at::kInt)); |
| 54 | + |
| 55 | + return output; |
| 56 | +} |
| 57 | +} // namespace sglang::LIHost |
| 58 | + |
| 59 | +namespace sglang { |
| 60 | +namespace npu_kernel { |
| 61 | +HOST_API at::Tensor lightning_indexer(const at::Tensor &query, const at::Tensor &key, const at::Tensor &weights, |
| 62 | + const c10::optional<at::Tensor> &actual_seq_lengths_query, |
| 63 | + const c10::optional<at::Tensor> &actual_seq_lengths_key, |
| 64 | + const c10::optional<at::Tensor> &block_table, |
| 65 | + c10::optional<c10::string_view> layout_query, |
| 66 | + c10::optional<c10::string_view> layout_key, c10::optional<int64_t> sparse_count, |
| 67 | + c10::optional<int64_t> sparse_mode) |
| 68 | +{ |
| 69 | + using namespace LIHost; |
| 70 | + LightningIndexer indexer("lightning_indexer"); |
| 71 | + auto context = std::make_shared<TilingContext>("lightning_indexer"); |
| 72 | + TORCH_CHECK(context != nullptr, "TilingContext is null"); |
| 73 | + |
| 74 | + std::string layoutQuery(indexer.GetAttr(ATTR_QUERY_LAYOUT_INDEX).GetString()); |
| 75 | + std::string layoutKey(indexer.GetAttr(ATTR_KEY_LAYOUT_INDEX).GetString()); |
| 76 | + int64_t sparseCount = std::any_cast<int32_t>(indexer.GetAttr(ATTR_SPARSE_COUNT_INDEX).GetValue()); |
| 77 | + |
| 78 | + if (layout_query.has_value()) { |
| 79 | + layoutQuery = std::string(layout_query.value()); |
| 80 | + indexer.SetAttrStr("layout_query", layoutQuery); |
| 81 | + } |
| 82 | + if (layout_key.has_value()) { |
| 83 | + layoutKey = std::string(layout_key.value()); |
| 84 | + indexer.SetAttrStr("layout_key", layoutKey); |
| 85 | + } |
| 86 | + if (sparse_count.has_value()) { |
| 87 | + sparseCount = sparse_count.value(); |
| 88 | + indexer.SetAttrAny("sparse_count", static_cast<int32_t>(sparseCount)); |
| 89 | + } |
| 90 | + if (sparse_mode.has_value()) { |
| 91 | + indexer.SetAttrAny("sparse_mode", static_cast<int32_t>(sparse_mode.value())); |
| 92 | + } |
| 93 | + |
| 94 | + at::Tensor sparse_indices = ConstructLightningIndexerOutputTensor(query, key, actual_seq_lengths_query, sparseCount, |
| 95 | + layoutQuery, layoutKey); |
| 96 | + |
| 97 | + auto qScalarType = query.scalar_type(); |
| 98 | + |
| 99 | + at::Tensor actualSeqLengthsQuery = |
| 100 | + actual_seq_lengths_query.has_value() |
| 101 | + ? actual_seq_lengths_query.value() |
| 102 | + : at::empty({1}, at::TensorOptions().dtype(qScalarType).device(query.options().device())); |
| 103 | + |
| 104 | + at::Tensor actualSeqLengthsKey = |
| 105 | + actual_seq_lengths_key.has_value() |
| 106 | + ? actual_seq_lengths_key.value() |
| 107 | + : at::empty({1}, at::TensorOptions().dtype(qScalarType).device(query.options().device())); |
| 108 | + |
| 109 | + at::Tensor blockTable = |
| 110 | + block_table.has_value() |
| 111 | + ? block_table.value() |
| 112 | + : at::empty({1}, at::TensorOptions().dtype(qScalarType).device(query.options().device())); |
| 113 | + |
| 114 | + indexer.SetToContext(context, qScalarType); |
| 115 | + context->RegisterTensor(query, true); |
| 116 | + context->RegisterTensor(key, true); |
| 117 | + context->RegisterTensor(weights, true); |
| 118 | + context->RegisterTensor(actual_seq_lengths_query, true); |
| 119 | + context->RegisterTensor(actual_seq_lengths_key, true); |
| 120 | + context->RegisterTensor(block_table, true); |
| 121 | + context->RegisterTensor(sparse_indices, false); |
| 122 | + |
| 123 | + LITilingInfo liInfo; |
| 124 | + LIInfoParser LIInfoParser(context.get()); |
| 125 | + TORCH_CHECK(LIInfoParser.ParseAndCheck(liInfo) == ge::GRAPH_SUCCESS, "lightning_indexer ParseAndCheck failed") |
| 126 | + |
| 127 | + LightningIndexerTiling liTiling(context.get()); |
| 128 | + liTiling.DoTiling(&liInfo); |
| 129 | + const auto &tilingData = liTiling.GetTilingData(); |
| 130 | + |
| 131 | + uint32_t tilingSize = sizeof(LITilingData); |
| 132 | + auto blockDim = tilingData.usedCoreNum; |
| 133 | + auto bs = tilingData.bSize; |
| 134 | + at::Tensor tilingTensor; |
| 135 | + |
| 136 | + auto tup = |
| 137 | + std::make_tuple(tilingData.bSize, tilingData.n2Size, tilingData.gSize, tilingData.s1Size, tilingData.s2Size, |
| 138 | + tilingData.blockSize, tilingData.maxBlockNumPerBatch, tilingData.tilingKey); |
| 139 | + auto hashValue = host_utils::TupleHasher::Hash(tup); |
| 140 | + |
| 141 | + static auto globalTilingBuffer = at::empty({tilingSize * MAX_CAPTURE_NUM}, |
| 142 | + at::TensorOptions().dtype(at::kByte).device(query.options().device())); |
| 143 | + |
| 144 | + if (actualCaptureNum >= MAX_CAPTURE_NUM) { |
| 145 | + static auto preillTilingBuffer = |
| 146 | + at::empty({tilingSize}, at::TensorOptions().dtype(at::kByte).device(query.options().device())); |
| 147 | + aclrtMemcpy(preillTilingBuffer.data_ptr<uint8_t>(), tilingSize, &tilingData, tilingSize, |
| 148 | + ACL_MEMCPY_HOST_TO_DEVICE); |
| 149 | + tilingTensor = at::from_blob(preillTilingBuffer.data_ptr<uint8_t>(), tilingSize, at::kByte); |
| 150 | + } else if (captureMap.find(hashValue) != captureMap.end()) { |
| 151 | + // Decode replay phase and part of cached prefill tiling data got from globalTilingBuffer |
| 152 | + tilingTensor = at::from_blob(globalTilingBuffer.data_ptr<uint8_t>() + (tilingSize * captureMap[hashValue]), |
| 153 | + tilingSize, at::kByte); |
| 154 | + } else { |
| 155 | + // Captured tiling cached here |
| 156 | + captureMap[hashValue] = actualCaptureNum; |
| 157 | + aclrtMemcpy(globalTilingBuffer.data_ptr<uint8_t>() + actualCaptureNum * tilingSize, tilingSize, &tilingData, |
| 158 | + tilingSize, ACL_MEMCPY_HOST_TO_DEVICE); |
| 159 | + actualCaptureNum++; |
| 160 | + tilingTensor = at::from_blob(globalTilingBuffer.data_ptr<uint8_t>() + (tilingSize * captureMap[hashValue]), |
| 161 | + tilingSize, at::kByte); |
| 162 | + } |
| 163 | + |
| 164 | + size_t workspaceSize = context->GetWorkspaceSize(); |
| 165 | + auto workspace = at::empty({workspaceSize}, at::TensorOptions().dtype(at::kByte).device(query.options().device())); |
| 166 | + EXEC_KERNEL_CMD(lightning_indexer, blockDim, query, key, weights, actualSeqLengthsQuery, actualSeqLengthsKey, |
| 167 | + blockTable, sparse_indices, workspace, tilingTensor); |
| 168 | + return sparse_indices; |
| 169 | +} |
| 170 | +} // namespace npu_kernel |
| 171 | +} // namespace sglang |
0 commit comments