|
| 1 | +#include <iostream> |
| 2 | +#include <string> |
| 3 | +#include "acl/acl.h" |
| 4 | +#include "kernel_tiling/kernel_tiling.h" |
| 5 | +#include "tiling/platform/platform_ascendc.h" |
| 6 | +#include "tiling/tiling_data.h" |
| 7 | +#include "defines.h" |
| 8 | +#include "torch_helper.h" |
| 9 | +#include "common_tiling.h" |
| 10 | +#include "aclrtlaunch_batch_matmul_transpose.h" |
| 11 | + |
| 12 | +namespace sglang { |
| 13 | +namespace npu_kernel { |
| 14 | +using namespace pp_matmul; |
| 15 | + |
// Maps the user-facing `quant_mode` string to the numeric value that is
// later cast to MatMul::QuantMode in batch_matmul_transpose().
std::unordered_map<c10::string_view, uint16_t> quantModeMap = {
    {"per_channel_symm", 0},
    {"per_channel_asymm", 1},
    {"per_token_symm", 2},
};
| 21 | + |
// Maps the user-facing `format_mode` string to the numeric value that is
// later cast to TensorFormat in batch_matmul_transpose() (0 = ND, 1 = NZ).
std::unordered_map<c10::string_view, uint16_t> formatModeMap = {
    {"ND", 0},
    {"NZ", 1},
};
| 26 | + |
// Translates the torch scalar type of the input tensors to the kernel-side
// TensorDType enum. Only half and bf16 appear here; batch_matmul_transpose()
// rejects every other dtype before performing the lookup.
std::unordered_map<c10::ScalarType, TensorDType> atType2tensorDType = {
    {at::ScalarType::BFloat16, TensorDType::TENSOR_DTYPE_BF16},
    {at::ScalarType::Half, TensorDType::TENSOR_DTYPE_FLOAT16}};
| 30 | + |
// Capacity (number of slots) of the per-process tiling-data cache used by
// batch_matmul_transpose(). NOTE(review): the original comment reads
// "batch size -> memory index", but the slot index is computed from
// opShape.m - 1, not the batch dimension — confirm which is intended.
constexpr uint32_t MAX_CAPTURE_NUM = 1024;
| 33 | + |
| 34 | +template <typename MapType> |
| 35 | +inline int GetModeVal(const MapType &mode_map, c10::optional<c10::string_view> mode_opt, c10::string_view default_mode, |
| 36 | + const char *mode_name) |
| 37 | +{ |
| 38 | + std::string modeStr(mode_name); |
| 39 | + c10::string_view mode_str = mode_opt.value_or(default_mode); |
| 40 | + auto it = mode_map.find(mode_str); |
| 41 | + // if input mode is unsupported, use default value |
| 42 | + TORCH_CHECK(it != mode_map.end(), modeStr, c10::str(": Unsupported mode value ", mode_str)); |
| 43 | + return it->second; |
| 44 | +} |
| 45 | + |
| 46 | +HOST_API void batch_matmul_transpose(const at::Tensor &tensor_a, const at::Tensor &tensor_b, at::Tensor &tensor_c, |
| 47 | + c10::optional<c10::string_view> format_mode, |
| 48 | + c10::optional<c10::string_view> quant_mode) |
| 49 | +{ |
| 50 | + auto tensorAShape = tensor_a.sizes(); |
| 51 | + auto tensorBShape = tensor_b.sizes(); |
| 52 | + auto tensorCShape = tensor_c.sizes(); |
| 53 | + uint32_t n; |
| 54 | + uint32_t block_dim; |
| 55 | + HardwareInfo hwInfo; |
| 56 | + std::map<c10::ScalarType, float> dTypeMap = {{at::ScalarType::Half, 2.0}, {at::ScalarType::BFloat16, 2.0}}; |
| 57 | + |
| 58 | + at::ScalarType aType = tensor_a.scalar_type(); |
| 59 | + at::ScalarType bType = tensor_b.scalar_type(); |
| 60 | + at::ScalarType cType = tensor_c.scalar_type(); |
| 61 | + TORCH_CHECK(aType == bType && bType == cType, "tensor type is not the same"); |
| 62 | + TORCH_CHECK((aType == at::ScalarType::BFloat16) || (aType == at::ScalarType::Half), |
| 63 | + "tensor type only support half or bf16"); |
| 64 | + |
| 65 | + TensorFormat formatMode = static_cast<TensorFormat>(GetModeVal(formatModeMap, format_mode, "ND", "format_mode")); |
| 66 | + MatMul::QuantMode quantMode = |
| 67 | + static_cast<MatMul::QuantMode>(GetModeVal(quantModeMap, quant_mode, "per_channel_symm", "quant_mode")); |
| 68 | + |
| 69 | + TORCH_CHECK(tensorAShape.size() == 3, "batch size is not same between srcTensor and dstTensor"); |
| 70 | + if (formatMode == TensorFormat::TENSOR_FORMAT_ND) { |
| 71 | + TORCH_CHECK(tensorBShape.size() == 3, "tensor shape should be dim3 in ND format"); |
| 72 | + TORCH_CHECK(tensorAShape[2] == tensorBShape[1], "tensor shape is wrong"); |
| 73 | + n = tensorBShape[2]; |
| 74 | + } else { |
| 75 | + TORCH_CHECK(tensorBShape.size() == 4, "tensor shape should be dim4 in nz format"); |
| 76 | + TORCH_CHECK(tensorAShape[2] == tensorBShape[2], "tensor shape is wrong"); |
| 77 | + n = tensorBShape[1] * tensorBShape[3]; |
| 78 | + } |
| 79 | + TORCH_CHECK(tensorAShape[1] == tensorBShape[0], "tensor shape is wrong"); |
| 80 | + |
| 81 | + OpShape opShape = {.batchSize = static_cast<uint32_t>(tensorAShape[1]), |
| 82 | + .m = static_cast<uint32_t>(tensorAShape[0]), |
| 83 | + .k = static_cast<uint32_t>(tensorAShape[2]), |
| 84 | + .n = n}; |
| 85 | + PpMatmulTilingData matmulTilingData = { |
| 86 | + .opShape = opShape, |
| 87 | + }; |
| 88 | + auto dType = atType2tensorDType[aType]; |
| 89 | + MatMulInfo mmInfo = {.batchSize = opShape.batchSize, |
| 90 | + .m = opShape.m, |
| 91 | + .k = opShape.k, |
| 92 | + .n = opShape.n, |
| 93 | + .dtypeA = dType, |
| 94 | + .dtypeB = dType, |
| 95 | + .dtypeC = dType, |
| 96 | + .formatB = formatMode, |
| 97 | + .mmType = MatMul::MatMulType::MATMUL_EIN_SUM, |
| 98 | + .inDtype = dTypeMap[aType], |
| 99 | + .outDtype = dTypeMap[cType], |
| 100 | + .quantMode = quantMode}; |
| 101 | + GetPpMatmulTiling(mmInfo, hwInfo, block_dim, matmulTilingData); |
| 102 | + host_utils::PpMatmulTilingCheck(matmulTilingData); |
| 103 | + |
| 104 | + // tiling |
| 105 | + int32_t batchIdx = opShape.m - 1; |
| 106 | + uint32_t tilingSize = sizeof(PpMatmulTilingData); |
| 107 | + static auto global_tiling_data = at::empty( |
| 108 | + {tilingSize * MAX_CAPTURE_NUM}, at::TensorOptions().dtype(at::kByte).device(tensor_a.options().device())); |
| 109 | + if (batchIdx >= 0 && batchIdx < MAX_CAPTURE_NUM) { |
| 110 | + aclrtMemcpy(global_tiling_data.data_ptr<uint8_t>() + (tilingSize * batchIdx), tilingSize, &matmulTilingData, |
| 111 | + tilingSize, ACL_MEMCPY_HOST_TO_DEVICE); |
| 112 | + } else { |
| 113 | + // Handle the case where batchIdx is out of range |
| 114 | + TORCH_CHECK(false, "batchIdx is out of range: ", batchIdx); |
| 115 | + } |
| 116 | + at::Tensor tiling_tensor = |
| 117 | + at::from_blob(global_tiling_data.data_ptr<uint8_t>() + (tilingSize * batchIdx), tilingSize, at::kByte); |
| 118 | + |
| 119 | + EXEC_KERNEL_CMD(batch_matmul_transpose, block_dim, tensor_a, tensor_b, tensor_c, tiling_tensor); |
| 120 | +} |
| 121 | + |
| 122 | +} // namespace npu_kernel |
| 123 | + |
| 124 | +} // namespace sglang |
0 commit comments