Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,9 @@ if(ARCH STREQUAL "x86_64")
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "12.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real) # 5090
endif ()
# sm_100a (B200, Blackwell data-center) needs a CUDA 12.8+ toolchain, same
# floor as the 120a (5090) entry above.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "12.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES 100a-real) # B200
endif()
if (MSVC)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES 80-real 90a-real)
endif ()
Expand Down
13 changes: 13 additions & 0 deletions src/turbomind/core/copy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,19 @@ void BatchCopy::Run()
return;
}

// cuMemcpyBatchAsync is known to crash on sm_100 (Blackwell); use sequential path.
int device = 0;
(void)cudaGetDevice(&device);
int major = 0;
(void)cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
if (major >= 10) {
for (unsigned i = 0; i < src_.size(); ++i) {
Comment on lines +84 to +90
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This SM100 workaround queries cudaGetDevice + cudaDeviceGetAttribute(cudaDevAttrComputeCapabilityMajor, ...) on every BatchCopy::Run() call. BatchCopy can be used in tight loops, so this adds avoidable overhead. Consider caching the compute capability once (e.g. static local, or a member initialized in the constructor, with a per-device cache if needed).

Copilot uses AI. Check for mistakes.
core::Copy(src_[i], size_[i], dst_[i]);
}
Reset();
return;
}

std::visit(
[&](auto&& copy) {
using T = std::decay_t<decltype(copy)>;
Expand Down
85 changes: 70 additions & 15 deletions src/turbomind/kernels/gemm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,44 @@
# Copyright (c) OpenMMLab. All rights reserved.

# Per-architecture GEMM kernel source groups. Split out so that the SM90
# sources can be compiled into a separate target with a narrowed
# CUDA_ARCHITECTURES list (see the gemm2_sm90 target below).
set(GEMM2_KERNELS_SM70
kernel/sm70_884_4.cu
kernel/sm70_884_8.cu
kernel/sm70_884_16.cu
)
set(GEMM2_KERNELS_SM75
kernel/sm75_16816_4.cu
kernel/sm75_16816_8.cu
kernel/sm75_16816_16.cu
)
set(GEMM2_KERNELS_SM80
kernel/sm80_16816_4.cu
kernel/sm80_16816_8.cu
kernel/sm80_16816_16.cu
)
# SM90-only kernels; these do not compile for sm_100.
set(GEMM2_KERNELS_SM90
kernel/sm90_16816_4.cu
kernel/sm90_16816_8.cu
kernel/sm90_16816_16.cu
kernel/sm90_64n32_8.cu
)

# Decide whether the SM90 kernel set is built at all, and with which arch list.
set(GEMM2_ARCH_90_ENABLED FALSE)
# Any SM90 variant requested explicitly (90, 90a, 90-real, ...)?
set(_sm90_archs "${CMAKE_CUDA_ARCHITECTURES}")
list(FILTER _sm90_archs INCLUDE REGEX "^90")
if(_sm90_archs)
set(GEMM2_ARCH_90_ENABLED TRUE)
else()
# When building for SM100+ without explicit SM90, still compile SM90 CUTLASS
# kernels so the fat binary can run MoE models on H100 (CUTLASS fused path).
set(_sm100_archs "${CMAKE_CUDA_ARCHITECTURES}")
list(FILTER _sm100_archs INCLUDE REGEX "^100")
if(_sm100_archs)
set(GEMM2_ARCH_90_ENABLED TRUE)
# No SM90 entry was requested, so fall back to plain "90" for the
# dedicated SM90 target's CUDA_ARCHITECTURES.
set(_sm90_archs "90")
message(STATUS "GEMM: auto-enabling SM90 CUTLASS kernels for H100 backward compatibility")
endif()
endif()

add_library(gemm2
gemm.cu
kernel.cu
Expand All @@ -16,28 +55,25 @@ add_library(gemm2
tuner/sampler.cu
tuner/stopping_criterion.cc
tuner/params.cc
kernel/sm90_16816_4.cu
kernel/sm90_16816_8.cu
kernel/sm90_16816_16.cu
kernel/sm80_16816_4.cu
kernel/sm80_16816_8.cu
kernel/sm80_16816_16.cu
kernel/sm75_16816_4.cu
kernel/sm75_16816_8.cu
kernel/sm75_16816_16.cu
kernel/sm70_884_4.cu
kernel/sm70_884_8.cu
kernel/sm70_884_16.cu
kernel/sm90_64n32_8.cu
${GEMM2_KERNELS_SM70}
${GEMM2_KERNELS_SM75}
${GEMM2_KERNELS_SM80}
cublas.cu
moe_utils_v2.cu
test/test_utils.cu
)

target_link_libraries(gemm2 PRIVATE parser nvidia::cutlass::cutlass CUDA::cuda_driver)


target_compile_definitions(gemm2 PRIVATE -DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
# cublasGemmGroupedBatchedEx (CUDA 12.5+): grouped batched GEMM for MoE on SM100
set(_has_sm100 FALSE)
set(_archs_100 "${CMAKE_CUDA_ARCHITECTURES}")
list(FILTER _archs_100 INCLUDE REGEX "^100")
if(_archs_100 AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.5")
set(_has_sm100 TRUE)
Comment on lines +69 to +73
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_has_sm100 is set but never used. If it’s not needed, remove it; if it is meant to drive later logic, wire it up so the intent is clear (unused variables in CMake can hide configuration bugs).

Suggested change
set(_has_sm100 FALSE)
set(_archs_100 "${CMAKE_CUDA_ARCHITECTURES}")
list(FILTER _archs_100 INCLUDE REGEX "^100")
if(_archs_100 AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.5")
set(_has_sm100 TRUE)
set(_archs_100 "${CMAKE_CUDA_ARCHITECTURES}")
list(FILTER _archs_100 INCLUDE REGEX "^100")
if(_archs_100 AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.5")

Copilot uses AI. Check for mistakes.
target_compile_definitions(gemm2 PRIVATE ENABLE_CUBLAS_GROUPED=1)
message(STATUS "GEMM: ENABLE_CUBLAS_GROUPED=1 (cublasGemmGroupedBatchedEx for MoE on SM100)")
endif()

target_compile_options(gemm2 PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:
Expand All @@ -48,7 +84,26 @@ target_compile_options(gemm2 PRIVATE
set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

if(GEMM2_ARCH_90_ENABLED)
# SM90 kernels only compile for 90/90a; avoid building them for sm_100.
# A separate static library lets us narrow CUDA_ARCHITECTURES to just the
# SM90 entries (_sm90_archs, computed earlier in this file) while the main
# gemm2 target keeps the full architecture list.
add_library(gemm2_sm90 STATIC ${GEMM2_KERNELS_SM90})
set_target_properties(gemm2_sm90 PROPERTIES
CUDA_ARCHITECTURES "${_sm90_archs}"
POSITION_INDEPENDENT_CODE ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
target_compile_definitions(gemm2_sm90 PRIVATE -DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
# Mirror gemm2's CUDA compile options so both targets build uniformly.
target_compile_options(gemm2_sm90 PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:
-Xptxas=-v
--generate-line-info
--threads 16>
)
target_link_libraries(gemm2_sm90 PRIVATE parser nvidia::cutlass::cutlass CUDA::cuda_driver)
target_link_libraries(gemm2 PRIVATE gemm2_sm90)

# Let gemm2 sources know at compile time that the SM90 kernels are linked in.
target_compile_definitions(gemm2 PRIVATE GEMM2_ARCH_90_ENABLED)
endif()

if (BUILD_TEST)
add_executable(test_gemm_v2
Expand Down
9 changes: 8 additions & 1 deletion src/turbomind/kernels/gemm/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,15 @@ struct Sm80: Arch<800, 900> {
static constexpr int value = 800;
};

struct Sm90: Arch<900> {
// Hopper (SM90). NOTE(review): the second Arch<> argument appears to be the
// exclusive upper bound of the compatible-device range (mirroring
// Sm80: Arch<800, 900> above), extending SM90 kernel eligibility up to but
// not including sm_100 -- confirm against the Arch<> template definition.
struct Sm90: Arch<900, 1000> {
static constexpr int value = 900;
};

// B200 (Blackwell) SM 100. Single Arch<> argument, i.e. no upper bound is
// specified for the compatible range -- presumably sm_100 and up; verify
// against the Arch<> template definition.
struct Sm100: Arch<1000> {
static constexpr int value = 1000;
};

inline bool is_arch_compatible(int karch, int darch)
{
switch (karch) {
Expand All @@ -43,6 +48,8 @@ inline bool is_arch_compatible(int karch, int darch)
return Sm80::is_compatible(darch);
case 900:
return Sm90::is_compatible(darch);
case 1000:
return Sm100::is_compatible(darch);
default:
return false;
}
Expand Down
4 changes: 4 additions & 0 deletions src/turbomind/kernels/gemm/convert_v3.cu
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ std::array<const LayoutConverter*, 2> GetConverters(DataType data_type,
if (weight_type == kHalf || weight_type == kBfloat16) {
constexpr Cvt<uint16_t, uint16_t> W;
if (grouped) {
// SM100: CublasGroupedKernel uses cublasGemmGroupedBatchedEx, which expects standard
// (K,N) row-major weight. Skip tiled conversion to avoid layout mismatch.
if (sm >= 100)
return {};
// clang-format off
if (sm >= 80) return {W(sm8_, kRow, s16816h | B | _1), {}};
if (sm == 75) return {W(sm75, kRow, s16816h | B | _1), {}};
Expand Down
Loading
Loading