Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ if (MSVC)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:preprocessor>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus>)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/Zc:preprocessor -Xcompiler=/Zc:__cplusplus")
# nvcc's front-end has a known bug (fixed in NVCC 13.1) where the fmt 11.x
# literal-encoding probe (fmt/base.h:is_utf8_enabled) always returns false,
# causing a static_assert failure. Disable fmt's Unicode check project-wide;
# without /utf-8 the MSVC host compiler also sees use_utf8=false, so all TUs
# agree and there is no ODR inconsistency.
add_compile_definitions(FMT_UNICODE=0)
endif ()

find_package(CUDAToolkit REQUIRED)
Expand All @@ -39,6 +45,19 @@ option(BUILD_FAST_MATH "Build in fast math mode" ON)

include(FetchContent)

FetchContent_Declare(
fmt
GIT_REPOSITORY https://github.com/fmtlib/fmt.git
GIT_TAG 11.1.4
GIT_SHALLOW ON
GIT_PROGRESS TRUE
USES_TERMINAL_DOWNLOAD TRUE
EXCLUDE_FROM_ALL
)
FetchContent_MakeAvailable(fmt)
set_target_properties(fmt PROPERTIES POSITION_INDEPENDENT_CODE ON)
include_directories(${fmt_SOURCE_DIR}/include)

if (BUILD_TEST)
FetchContent_Declare(
Catch2
Expand Down Expand Up @@ -269,6 +288,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}")
# Suppress nvcc warning 128 (loop not reachable) in fmt headers - nvcc false positive
# Suppress nvcc warning 27 (character value out of range) from fmt/format.h's
# fractional_part_rounding_thresholds which uses U32 string literals with values
# exceeding 0x10FFFF - harmless, pending fix in fmtlib (PR #4719)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --diag-suppress=128 --diag-suppress=27")

string(REPLACE "-O2" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "-O2" "" CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}")
Expand Down
4 changes: 2 additions & 2 deletions src/turbomind/comm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ cmake_minimum_required(VERSION 3.11)
find_package(Threads)

add_library(host_comm STATIC host_comm.cc thread_comm.cc)
target_link_libraries(host_comm PRIVATE core logger Threads::Threads)
target_link_libraries(host_comm PRIVATE core Threads::Threads)
set_property(TARGET host_comm PROPERTY POSITION_INDEPENDENT_CODE ON)

add_library(device_comm STATIC device_comm.cc)
target_link_libraries(device_comm PRIVATE core logger)
target_link_libraries(device_comm PRIVATE core)
set_property(TARGET device_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET device_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

Expand Down
2 changes: 1 addition & 1 deletion src/turbomind/comm/cuda_ipc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ target_link_libraries(cuda_ipc_comm PRIVATE
core
cuda_utils
CUDA::cuda_driver
logger)
)

set_property(TARGET cuda_ipc_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cuda_ipc_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
10 changes: 5 additions & 5 deletions src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

#include "src/turbomind/comm/cuda_ipc/semaphore.h"

#include "src/turbomind/core/logger.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"

namespace turbomind::comm {

Expand Down Expand Up @@ -156,7 +156,7 @@ CudaIpcCommImpl::~CudaIpcCommImpl()
}

for (const auto& a : allocation_) {
TM_LOG_WARNING("[COMM][%d] Allocation (%p, %lu) is not freed", global_rank_, a.uc_beg, a.size);
TM_LOG_WARN("Rank {}: Allocation ({}, {}) is not freed", global_rank_, a.uc_beg, a.size);
}

cudaStreamSynchronize(0);
Expand Down Expand Up @@ -220,7 +220,7 @@ void CudaIpcCommImpl::Free(void* ptr)
allocation_.erase(it);
}
else {
TM_LOG_WARNING("[TM][COMM][%d] Freeing %p which is not allocated by this module", global_rank_, ptr);
TM_LOG_WARN("Rank {}: Freeing {} which is not allocated by this module", global_rank_, ptr);
}
}

Expand All @@ -230,7 +230,7 @@ void CudaIpcCommImpl::Register(void* ptr, size_t size)
auto& symm = groups_.at(0).symmetric;

if (symm.find(ptr) != symm.end()) {
TM_LOG_WARNING("[TM][COMM][%d] Duplicated registration on (%p, %lu)", global_rank_, ptr, size);
TM_LOG_WARN("Rank {}: Duplicated registration on ({}, {})", global_rank_, ptr, size);
return;
}

Expand Down Expand Up @@ -317,7 +317,7 @@ void CudaIpcCommImpl::Deregister(void* ptr)
Deregister(s.extract(it).value());
}
else {
TM_LOG_WARNING("[TM][COMM][%d] Deregistering non-registered address %p", global_rank_, ptr);
TM_LOG_WARN("Rank {}: Deregistering non-registered address {}", global_rank_, ptr);
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/turbomind/comm/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <string>
#include <type_traits>

#include "src/turbomind/utils/logger.h"
#include "src/turbomind/core/logger.h"

namespace turbomind {

Expand Down Expand Up @@ -40,7 +40,7 @@ auto GetEnv()
if (is_set) {
std::stringstream ss;
ss << x;
TM_LOG_INFO("[%s] %s=%s", E::prefix, E::name, ss.str().c_str());
TM_LOG_INFO("{} {}={}", E::prefix, E::name, ss.str());
}
return x;
}();
Expand Down
2 changes: 1 addition & 1 deletion src/turbomind/comm/gloo/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ add_library(gloo_comm STATIC
tcp_store.cc
)
set_property(TARGET gloo_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(gloo_comm PUBLIC gloo host_comm logger xgrammar)
target_link_libraries(gloo_comm PUBLIC gloo host_comm xgrammar)

add_executable(test_ipc_comm test_ipc_comm.cc)
target_link_libraries(test_ipc_comm PRIVATE gloo_comm Threads::Threads)
4 changes: 2 additions & 2 deletions src/turbomind/comm/gloo/gloo_comm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

#include "src/turbomind/comm/gloo/tcp_store.h"
#include "src/turbomind/comm/host_comm.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/core/logger.h"

namespace turbomind::comm {

Expand Down Expand Up @@ -346,7 +346,7 @@ class GlooGroupId: public HostGroupId {
void Initialize() override
{
info_ = GlobalStoreFactory::Instance().New();
TM_LOG_INFO("[TM][COMM] GlooGroupId=%s", info_.c_str());
TM_LOG_INFO("GlooGroupId={}", info_);
}

void Export(std::ostream& os) override
Expand Down
6 changes: 3 additions & 3 deletions src/turbomind/comm/gloo/tcp_store.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <gloo/transport/tcp/socket.h>

#include "src/turbomind/comm/gloo/tcp_store.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/core/logger.h"

namespace turbomind::comm {

Expand Down Expand Up @@ -148,7 +148,7 @@ TCPStore::TCPStore(const std::string& host, int port)
}
}
catch (const std::exception& e) {
TM_LOG_WARNING("[TM][COMM] Failed to connect to store after %d retries: %s", retry, e.what());
TM_LOG_WARN("Failed to connect to store after {} retries: {}", retry, e.what());
std::this_thread::sleep_for(std::chrono::seconds(1));
retry += 1;
}
Expand Down Expand Up @@ -209,7 +209,7 @@ void TCPStore::wait(const std::vector<std::string>& keys, const std::chrono::mil
ss << key << " ";
}
ss << "]";
TM_LOG_ERROR("[TM][COMM] %s, elapsed %lld s", ss.str().c_str(), elapsed.count());
TM_LOG_ERROR("{}, elapsed {} s", ss.str(), elapsed.count());
throw std::runtime_error("Wait timeout for key(s): " + ss.str());
}
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
Expand Down
2 changes: 1 addition & 1 deletion src/turbomind/comm/host_comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
#include <vector>

#include "src/turbomind/core/data_type.h"
#include "src/turbomind/core/logger.h"
#include "src/turbomind/core/serdes.h"
#include "src/turbomind/utils/logger.h"

namespace turbomind::comm {

Expand Down
2 changes: 1 addition & 1 deletion src/turbomind/comm/nccl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
cmake_minimum_required(VERSION 3.11)

add_library(nccl_comm STATIC nccl.cu)
target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES} logger)
target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES})
target_include_directories(nccl_comm PRIVATE ${NCCL_INCLUDE_DIRS})

set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
Expand Down
28 changes: 13 additions & 15 deletions src/turbomind/comm/nccl/nccl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
#include "src/turbomind/comm/device_comm.h"
#include "src/turbomind/comm/host_comm.h"
#include "src/turbomind/core/check.h"
#include "src/turbomind/core/logger.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/string_utils.h"

#include "src/turbomind/kernels/norm/rms_norm.h"

#define NCCLCHECK(e) \
if (auto ec = e; ec != ncclSuccess) { \
auto msg = fmtstr("NCCL error %s:%d '%s'", __FILE__, __LINE__, ncclGetErrorString(ec)); \
auto msg = fmt::format("NCCL error {}:{} '{}'", __FILE__, __LINE__, ncclGetErrorString(ec)); \
throw std::runtime_error(msg.c_str()); \
}

Expand Down Expand Up @@ -76,16 +76,15 @@ static NcclApis& nccl_apis()
};
if (version >= NCCL_VERSION(2, 27, 0)) {
if (version < NCCL_VERSION(2, 28, 0)) {
TM_LOG_WARNING(
"[NCCL] Window registration may cause memory leaks in NCCL 2.27, use NCCL 2.28+ or disable the feature by setting NCCL_WIN_ENABLE=0.");
TM_LOG_WARN(
"Window registration may cause memory leaks in NCCL 2.27, use NCCL 2.28+ or disable the feature by setting NCCL_WIN_ENABLE=0.");
}
load_symbol(apis.ncclCommWindowRegister, "ncclCommWindowRegister");
load_symbol(apis.ncclCommWindowDeregister, "ncclCommWindowDeregister");
}
else {
TM_LOG_WARNING(
"[NCCL] Window registration is not supported by NCCL %d, use NCCL 2.28+ for better performance.",
version);
TM_LOG_WARN("Window registration is not supported by NCCL {}, use NCCL 2.28+ for better performance.",
version);
}
if (version >= NCCL_VERSION(2, 19, 0)) {
load_symbol(apis.ncclMemAlloc, "ncclMemAlloc");
Expand All @@ -97,8 +96,7 @@ static NcclApis& nccl_apis()
load_symbol(apis.ncclCommSplit, "ncclCommSplit");
}
else {
TM_LOG_WARNING("[NCCL] Splitting communicators is not supported by NCCL %d, use NCCL 2.18+ if needed.",
version);
TM_LOG_WARN("Splitting communicators is not supported by NCCL {}, use NCCL 2.18+ if needed.", version);
}
return apis;
}();
Expand All @@ -116,16 +114,16 @@ public:
~NcclCommImpl()
{
for (const auto& [ptr, _] : handles_.at(0)) {
TM_LOG_WARNING("[NCCL][%d] Buffer %p is not deregistered", global_rank_, ptr);
TM_LOG_WARN("Rank {}: Buffer {} is not deregistered", global_rank_, ptr);
}

for (const auto& [ptr, size] : buffers_) {
TM_LOG_WARNING("[NCCL][%d] Allocation (%p, %lu) is not freed", global_rank_, ptr, size);
TM_LOG_WARN("Rank {}: Allocation ({}, {}) is not freed", global_rank_, ptr, size);
}

for (auto& c : groups_) {
if (auto ec = ncclCommDestroy(c); ec != ncclSuccess) {
TM_LOG_ERROR("[NCCL][%d] Failed to destroy communicator: %s", global_rank_, ncclGetErrorString(ec));
TM_LOG_ERROR("Rank {}: Failed to destroy communicator: {}", global_rank_, ncclGetErrorString(ec));
}
}
}
Expand Down Expand Up @@ -169,7 +167,7 @@ public:
buffers_.erase(ptr);
}
else {
TM_LOG_WARNING("[NCCL][%d] Freeing %p which is not allocated by NcclComm", global_rank_, ptr);
TM_LOG_WARN("Rank {}: Freeing {} which is not allocated by NcclComm", global_rank_, ptr);
}
}

Expand All @@ -181,7 +179,7 @@ public:
}
}
else {
TM_LOG_WARNING("[NCCL][%d] Duplicated registration on (%p, %lu)", global_rank_, ptr, size);
TM_LOG_WARN("Rank {}: Duplicated registration on ({}, {})", global_rank_, ptr, size);
}
}

Expand All @@ -193,7 +191,7 @@ public:
}
}
else {
TM_LOG_WARNING("[NCCL][%d] Deregistering non-registered address %p", global_rank_, ptr);
TM_LOG_WARN("Rank {}: Deregistering non-registered address {}", global_rank_, ptr);
}
}

Expand Down
10 changes: 5 additions & 5 deletions src/turbomind/comm/test_comm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -740,9 +740,9 @@ struct TestComm {
std::mt19937 gen{};
std::uniform_int_distribution dist{0, 31}; // 5 mantissa bits

TM_LOG_INFO("dp_size_0 %d, tp_size_0 %d", dp_size_0, tp_size_0);
TM_LOG_INFO("dp_size_1 %d, tp_size_1 %d", dp_size_1, tp_size_1);
TM_LOG_INFO("inner_tp %d", inner_tp);
TM_LOG_INFO("dp_size_0 {}, tp_size_0 {}", dp_size_0, tp_size_0);
TM_LOG_INFO("dp_size_1 {}, tp_size_1 {}", dp_size_1, tp_size_1);
TM_LOG_INFO("inner_tp {}", inner_tp);

vector tokens = tokens_;
for (auto& x : tokens) {
Expand Down Expand Up @@ -817,7 +817,7 @@ struct TestComm {
const int tp_rank_1 = d_comm->rank(group1);
const int local_id = g_rank / inner_tp; // which local partition this rank belongs to

// TM_LOG_INFO("g_rank %d, dp_rank_0 %d, tp_rank_0 %d, dp_rank_1 %d, tp_rank_1 %d, local_id %d",
// TM_LOG_INFO("g_rank {}, dp_rank_0 {}, tp_rank_0 {}, dp_rank_1 {}, tp_rank_1 {}, local_id {}",
// g_rank,
// dp_rank_0,
// tp_rank_0,
Expand Down Expand Up @@ -885,7 +885,7 @@ struct TestComm {
for (const auto& n : tokens) {
if (n % dp_size_1) {
if (g_rank == 0) {
TM_LOG_INFO("Skipped %d", n);
TM_LOG_INFO("Skipped {}", n);
}
continue;
}
Expand Down
23 changes: 20 additions & 3 deletions src/turbomind/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@

cmake_minimum_required(VERSION 3.11)

include(FetchContent)

FetchContent_Declare(
concurrentqueue
GIT_REPOSITORY https://github.com/cameron314/concurrentqueue.git
GIT_TAG v1.0.4
GIT_SHALLOW ON
GIT_PROGRESS TRUE
USES_TERMINAL_DOWNLOAD TRUE
EXCLUDE_FROM_ALL
)
FetchContent_MakeAvailable(concurrentqueue)

add_library(core STATIC
check.cc
allocator.cc
Expand All @@ -12,9 +25,10 @@ add_library(core STATIC
tensor.cc
tensor.cu
module.cc
copy.cc)
copy.cc
logger.cc)

target_link_libraries(core PUBLIC cuda_utils logger CUDA::cudart CUDA::cuda_driver)
target_link_libraries(core PUBLIC cuda_utils CUDA::cudart CUDA::cuda_driver fmt::fmt concurrentqueue)

set_property(TARGET core PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET core PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
Expand All @@ -23,5 +37,8 @@ target_compile_options(core PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas=-v>)

if (BUILD_TEST)
add_executable(test_core test_core.cc)
target_link_libraries(test_core PRIVATE core logger Catch2::Catch2WithMain)
target_link_libraries(test_core PRIVATE core Catch2::Catch2WithMain)

add_executable(test_logger test_logger.cc)
target_link_libraries(test_logger PRIVATE core Catch2::Catch2WithMain)
endif ()
4 changes: 2 additions & 2 deletions src/turbomind/core/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ class StackAllocatorImpl: public AllocatorImpl {
p = underlying_impl_->allocate(size);
}

// TM_LOG_ERROR("allocate %p, %ld", p, size);
// TM_LOG_ERROR("allocate {}, {}", p, size);

size_ += size;
++num_;
Expand All @@ -159,7 +159,7 @@ class StackAllocatorImpl: public AllocatorImpl {
{
size = round_up(size, kAlignment);

// TM_LOG_ERROR("deallocate %p, %p, %ld", p, cached_ptr_, size);
// TM_LOG_ERROR("deallocate {}, {}, {}", p, cached_ptr_, size);

if ((char*)p + size == cached_ptr_) {
cached_ptr_ -= size;
Expand Down
Loading
Loading