Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <memory>

extern "C" {

Expand Down Expand Up @@ -140,4 +141,34 @@ template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);

// Custom deleter for std::shared_ptr-managed device allocations.
//
// Captures the stream/GPU the allocation was made on so the matching
// cuda_drop_with_size_tracking_async call releases the memory in the same
// stream order. The size tracker reference and allocation flag mirror the
// arguments used at allocation time.
template <class T> struct malloc_with_size_tracking_async_deleter {
private:
  cudaStream_t _stream;          // stream the allocation/free is ordered on
  uint32_t _gpu_index;           // GPU the pointer lives on
  uint64_t &_size_tracker;       // tracker passed at allocation time
  bool _allocate_gpu_memory;     // whether device memory was actually allocated

public:
  malloc_with_size_tracking_async_deleter(cudaStream_t stream,
                                          uint32_t gpu_index,
                                          uint64_t &size_tracker,
                                          bool allocate_gpu_memory)
      : _stream(stream), _gpu_index(gpu_index), _size_tracker(size_tracker),
        _allocate_gpu_memory(allocate_gpu_memory) {}

  // Called by shared_ptr when the last reference is dropped.
  void operator()(T *ptr) {
    cuda_drop_with_size_tracking_async(ptr, _stream, _gpu_index,
                                       _allocate_gpu_memory);
  }
};

template <class T>
std::shared_ptr<T> cuda_make_shared_with_size_tracking_async(
uint64_t size, cudaStream_t stream, uint32_t gpu_index,
uint64_t &size_tracker, bool allocate_gpu_memory) {
return std::shared_ptr<T>(
(T*)cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
size_tracker, allocate_gpu_memory),
malloc_with_size_tracking_async_deleter<T>(
stream, gpu_index, size_tracker, allocate_gpu_memory));
}

#endif
89 changes: 89 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,4 +183,93 @@ struct CudaStreams {
}
};

// Multi-GPU fork/join barrier built on CUDA events.
//
// After create_on(), one event per stream is available so that:
//  - local_streams_wait_for_stream_0(): worker streams (index >= 1) wait for
//    the work currently enqueued on stream 0 (fork point);
//  - stream_0_wait_for_local_streams(): stream 0 waits for the work currently
//    enqueued on all worker streams (join point).
// release() must be called before destruction because event teardown needs
// the GPU indices; the destructor asserts this.
struct CudaStreamsBarrier {
private:
  std::vector<cudaEvent_t> _events;
  CudaStreams _streams;

  // The barrier owns per-GPU CUDA events; copying would lead to
  // double-destruction of those events, so copy operations are deleted
  // (the old empty-bodied private copy ctor would have produced a
  // half-initialized barrier if ever invoked by a friend).
  CudaStreamsBarrier(const CudaStreamsBarrier &) = delete;
  CudaStreamsBarrier &operator=(const CudaStreamsBarrier &) = delete;

public:
  CudaStreamsBarrier() {}

  // Creates one event per stream in `streams`. Must be called exactly once
  // before any synchronization call; requires more than one GPU.
  void create_on(const CudaStreams &streams) {
    GPU_ASSERT(_events.empty(),
               "CudaStreamsBarrier: create_on called on an already "
               "initialized barrier");
    GPU_ASSERT(streams.count() > 1, "CudaStreamsBarrier: "
                                    "Attempted to create on single GPU");

    _streams = streams;
    _events.resize(streams.count());
    for (int i = 0; i < streams.count(); i++) {
      _events[i] = cuda_create_event(streams.gpu_index(i));
    }
  }

  // Fork: every worker stream (index >= 1) of `user_streams` waits until the
  // work currently enqueued on stream 0 has completed.
  void local_streams_wait_for_stream_0(const CudaStreams &user_streams) {
    GPU_ASSERT(!_events.empty(),
               "CudaStreamsBarrier: must call create_on before use");
    // Guard before indexing _streams.gpu_index(j) below.
    GPU_ASSERT(user_streams.count() <= _events.size(),
               "CudaStreamsBarrier: trying to synchronize too many streams. "
               "The barrier was created on %lu streams, while "
               "the user stream set has %u streams",
               _events.size(), user_streams.count());
    GPU_ASSERT(user_streams.gpu_index(0) == _streams.gpu_index(0),
               "CudaStreamsBarrier: synchronization can only be performed on "
               "the GPUs the barrier was initially created on.");

    cuda_event_record(_events[0], user_streams.stream(0),
                      user_streams.gpu_index(0));
    for (int j = 1; j < user_streams.count(); j++) {
      GPU_ASSERT(user_streams.gpu_index(j) == _streams.gpu_index(j),
                 "CudaStreamsBarrier: synchronization can only be performed on "
                 "the GPUs the barrier was initially created on.");
      cuda_stream_wait_event(user_streams.stream(j), _events[0],
                             user_streams.gpu_index(j));
    }
  }

  // Join: stream 0 waits until the work currently enqueued on every worker
  // stream (index >= 1) has completed. No-op when there is a single stream.
  void stream_0_wait_for_local_streams(const CudaStreams &user_streams) {
    GPU_ASSERT(!_events.empty(),
               "CudaStreamsBarrier: must call create_on before use");
    GPU_ASSERT(
        user_streams.count() <= _events.size(),
        "CudaStreamsBarrier: trying to synchronize too many "
        "streams. "
        "The barrier was created on a LUT that had %lu active streams, while "
        "the user stream set has %u streams",
        _events.size(), user_streams.count());

    if (user_streams.count() > 1) {
      // Worker GPUs record their events
      for (int j = 1; j < user_streams.count(); j++) {
        GPU_ASSERT(_streams.gpu_index(j) == user_streams.gpu_index(j),
                   "CudaStreamsBarrier: The user stream "
                   "set GPU[%d]=%u while the LUT stream set GPU[%d]=%u",
                   j, user_streams.gpu_index(j), j, _streams.gpu_index(j));

        cuda_event_record(_events[j], user_streams.stream(j),
                          user_streams.gpu_index(j));
      }

      // GPU 0 waits for all workers
      for (int j = 1; j < user_streams.count(); j++) {
        cuda_stream_wait_event(user_streams.stream(0), _events[j],
                               user_streams.gpu_index(0));
      }
    }
  }

  // Destroys all events. Must be called before the destructor runs.
  void release() {
    for (int j = 0; j < _streams.count(); j++) {
      cuda_event_destroy(_events[j], _streams.gpu_index(j));
    }

    _events.clear();
  }

  ~CudaStreamsBarrier() {
    GPU_ASSERT(_events.empty(),
               "CudaStreamsBarrier: must "
               "call release before destruction: events size = %lu",
               _events.size());
  }
};

#endif
Loading
Loading