A lightweight C++ library for dynamic CUDA runtime compilation and kernel caching. gpulite simplifies building and deploying CUDA-dependent applications by providing runtime symbol resolution and automated kernel compilation with caching.
No CUDA SDK Required at Build Time! - Compile your applications without installing the CUDA SDK.
- No Build-Time CUDA Dependencies: Compiles without CUDA SDK installed - only requires C++17 compiler
- Dynamic Symbol Resolution: Loads CUDA libraries (libcuda.so, libcudart.so, libnvrtc.so) at runtime
- Runtime Compilation: Compiles CUDA kernels using NVRTC with automatic compute capability detection
- Kernel Caching: Intelligent caching system to avoid recompilation of identical kernels
- Easy Integration: Header-only design for simple project integration
- Cross-Platform Support: Supports Linux and Windows platforms
Traditional CUDA applications require the CUDA SDK to be installed at build time and often have complex deployment requirements. gpulite solves this by:
- Eliminating build-time CUDA dependencies - No need for CUDA SDK during compilation
- Simplifying deployment - Applications can run on any system with CUDA drivers installed
- Reducing compilation overhead - Kernels are compiled once and cached for subsequent runs
- Providing runtime flexibility - Kernels can be modified or optimized at runtime
#include <gpulite/gpulite.hpp>
int main() {
// Check if CUDA is available
if (!CUDA_DRIVER_INSTANCE.loaded() || !NVRTC_INSTANCE.loaded()) {
throw std::runtime_error("CUDA runtime not available");
}
// Your CUDA kernel code as a string
const char* kernel_source = R"(
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
)";
// Create kernel factory and cache the kernel
auto& factory = KernelFactory::instance();
auto* kernel = factory.create(
"vector_add", // kernel name
kernel_source, // kernel source code
"vector_add.cu", // virtual source filename
{"-std=c++17"} // compilation options
);
// Allocate device memory and launch kernel
float *d_a, *d_b, *d_c;
int n = 1024;
// Use the cleaner GPULITE_CUDART_CALL macro (hides the global instance)
GPULITE_CUDART_CALL(cudaMalloc(reinterpret_cast<void**>(&d_a), n * sizeof(float)));
GPULITE_CUDART_CALL(cudaMalloc(reinterpret_cast<void**>(&d_b), n * sizeof(float)));
GPULITE_CUDART_CALL(cudaMalloc(reinterpret_cast<void**>(&d_c), n * sizeof(float)));
// Prepare kernel arguments
std::vector<void*> args = {&d_a, &d_b, &d_c, &n};
// Launch kernel
kernel->launch(
dim3((n + 127) / 128), // grid size
dim3(128), // block size
0, // shared memory size
nullptr, // stream
args, // kernel arguments
true // synchronize after launch
);
// Clean up
GPULITE_CUDART_CALL(cudaFree(d_a));
GPULITE_CUDART_CALL(cudaFree(d_b));
GPULITE_CUDART_CALL(cudaFree(d_c));
return 0;
}
Compilation:
Linux:
# Save the above code as main.cpp, then compile:
g++ -std=c++17 main.cpp -ldl -o my_gpu_app
# Run the application:
./my_gpu_app
Windows (with Visual Studio):
# Save the above code as main.cpp, then compile:
cl /std:c++17 main.cpp /Fe:my_gpu_app.exe
# Run the application:
my_gpu_app.exe
Requirements:
- C++17 compatible compiler (GCC 7+, Clang 5+, MSVC 2017+)
- CUDA SDK installed at run-time, but not at build time!
- Linux or Windows
// Load kernel from a .cu file
auto* kernel = KernelFactory::instance().createFromSource(
"my_kernel",
"/path/to/kernel.cu",
"kernel.cu",
{"-std=c++17"}
);
For templated kernels, use the getKernelName helper:
// For a templated kernel like: template<typename T> __global__ void process(T* data)
std::string kernel_name = getKernelName<float>("process");
auto* kernel = factory.create(kernel_name, source, "template_kernel.cu", {});
cmake_minimum_required(VERSION 3.14)
project(MyProject LANGUAGES CXX)
include(FetchContent)
FetchContent_Declare(
gpulite
GIT_REPOSITORY https://github.com/rubber-duck-debug/gpu-lite.git
GIT_TAG v1.0.0
)
FetchContent_MakeAvailable(gpulite)
add_executable(my_app main.cpp)
target_link_libraries(my_app PRIVATE gpulite)
This is the recommended approach because:
- Dependencies (${CMAKE_DL_LIBS}, Threads) are automatically linked
- Include paths are automatically configured
- Use a specific tag (e.g., v1.0.0) for reproducible builds
Clone or copy the gpulite repo to external/gpulite, then:
cmake_minimum_required(VERSION 3.12)
project(MyProject LANGUAGES CXX)
add_subdirectory(external/gpulite)
add_executable(my_app main.cpp)
target_link_libraries(my_app PRIVATE gpulite)
Copy gpulite/gpulite.hpp to your project and link dependencies manually:
cmake_minimum_required(VERSION 3.12)
project(MyProject LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
add_executable(my_app main.cpp)
target_include_directories(my_app PRIVATE path/to/gpulite)
if(NOT WIN32)
target_link_libraries(my_app PRIVATE ${CMAKE_DL_LIBS})
endif()
std::vector<std::string> options = {
"-std=c++17", // C++ standard
"--use_fast_math", // Fast math operations
"-DBLOCK_SIZE=256", // Preprocessor definitions
"--maxrregcount=32" // Maximum register count
};
auto* kernel = factory.create("optimized_kernel", source, "kernel.cu", options);
// Set maximum dynamic shared memory
kernel->setFuncAttribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192);
// Get kernel register usage
int reg_count = kernel->getFuncAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS);
std::cout << "Kernel uses " << reg_count << " registers per thread" << std::endl;
// Create CUDA stream
cudaStream_t stream;
GPULITE_CUDART_CALL(cudaStreamCreate(&stream));
// Launch kernel asynchronously
kernel->launch(
grid, block, shared_mem_size,
reinterpret_cast<void*>(stream),
args,
false // don't synchronize
);
// Do other work...
// Synchronize when needed
GPULITE_CUDART_CALL(cudaStreamSynchronize(stream));
GPULITE_CUDART_CALL(cudaStreamDestroy(stream));
gpulite provides comprehensive error checking with detailed error messages:
try {
auto* kernel = factory.create("my_kernel", source, "kernel.cu", {});
kernel->launch(grid, block, 0, nullptr, args);
} catch (const std::runtime_error& e) {
std::cerr << "CUDA Error: " << e.what() << std::endl;
// Error message includes file and line information
}
- NVIDIA GPU with CUDA capability 3.0 or higher
- NVIDIA CUDA drivers installed
- Linux or Windows operating system
- C++17 compatible compiler (GCC 7+, Clang 5+, MSVC 2017+)
- CMake 3.12 or higher
- Standard C++ library with threading support
- No CUDA SDK installation required - gpulite uses minimal type wrappers
Linux:
- libcuda.so — CUDA Driver API
- libcudart.so — CUDA Runtime API
- libnvrtc.so — NVIDIA Runtime Compilation API
Windows:
- nvcuda.dll — CUDA Driver API
- cudart64_*.dll — CUDA Runtime API
- nvrtc64_*.dll — NVIDIA Runtime Compilation API
Note: These libraries are only required at runtime, provided by the CUDA SDK/NVIDIA drivers.
| Platform | Status |
|---|---|
| Linux | ✅ Supported |
| Windows | ✅ Supported |
| macOS | ❌ Not applicable (no CUDA support) |
| Backend | Status |
|---|---|
| CUDA | ✅ Supported |
| HIP | 🚧 Planned |
- First Launch: Kernels are compiled on first use, which may add initial latency
- Subsequent Launches: Cached kernels launch immediately with minimal overhead
- Memory Usage: Compiled kernels are kept in memory for the application lifetime
- Context Switching: gpulite automatically handles CUDA context management
Linux:
- Ensure NVIDIA drivers are installed: nvidia-smi
- Verify CUDA libraries are in library path: ldconfig -p | grep cuda
Windows:
- Ensure NVIDIA drivers are installed: nvidia-smi
- Verify CUDA libraries are in system PATH or same directory as executable
- Check that CUDA toolkit is installed and DLLs are accessible
- Check kernel syntax using nvcc offline
- Verify compute capability compatibility
- Review compilation options for conflicts
- Enable CUDA error checking in debug builds
- Use cuda-gdb and compute-sanitizer for kernel debugging
- Check memory alignment and access patterns
For convenience, platform-specific build scripts are provided in the scripts/ directory.
# Run from the gpulite root directory
./scripts/build_examples_linux.sh
# Run from the gpulite root directory
scripts\build_examples_windows.bat
You can also build manually with CMake:
# Create build directory
mkdir build && cd build
# Generate Makefiles
cmake ..
# Build the project
make -j$(nproc)
# Create build directory
mkdir build
cd build
# Generate Visual Studio project files
cmake .. -G "Visual Studio 16 2019" -A x64
# Build the project
cmake --build . --config Release
Note: Ensure you have:
- Visual Studio 2017 or later with C++17 support
- CMake 3.12 or later
- NVIDIA drivers installed (CUDA SDK not required at build time)
This project is licensed under the MIT License - see the LICENSE file for details.
- NVIDIA CUDA Toolkit documentation
- NVRTC API reference
- https://github.com/NVIDIA/jitify
- Contributors and early adopters
For more examples and advanced usage patterns, see the examples/ directory in this repository.