diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index 14666bd6c5..794f8afae6 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <memory>

 extern "C" {

@@ -140,4 +141,34 @@
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                           Torus *d_array, Torus value, Torus n);
+template <typename T> struct malloc_with_size_tracking_async_deleter {
+private:
+  cudaStream_t _stream;
+  uint32_t _gpu_index;
+  uint64_t &_size_tracker;
+  bool _allocate_gpu_memory;
+
+public:
+  malloc_with_size_tracking_async_deleter(cudaStream_t stream,
+                                          uint32_t gpu_index,
+                                          uint64_t &size_tracker,
+                                          bool allocate_gpu_memory)
+      : _stream(stream), _gpu_index(gpu_index), _size_tracker(size_tracker),
+        _allocate_gpu_memory(allocate_gpu_memory) {}
+
+  void operator()(T *ptr) {
+    cuda_drop_with_size_tracking_async(ptr, _stream, _gpu_index,
+                                       _allocate_gpu_memory);
+  }
+};
+
+template <typename T>
+std::shared_ptr<T> cuda_make_shared_with_size_tracking_async(
+    uint64_t size, cudaStream_t stream, uint32_t gpu_index,
+    uint64_t &size_tracker, bool allocate_gpu_memory) {
+  return std::shared_ptr<T>(
+      (T *)cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
+                                                size_tracker,
+                                                allocate_gpu_memory),
+      malloc_with_size_tracking_async_deleter<T>(
+          stream, gpu_index, size_tracker, allocate_gpu_memory));
+}
+
 #endif
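A minimal usage sketch for the helper above (illustrative only, not part of the patch; it assumes the existing `cuda_create_stream` API from device.h):

// Illustrative sketch: allocate 256 uint64_t words on GPU 0, tracked in
// `size_tracker`, and freed automatically via
// cuda_drop_with_size_tracking_async when the last shared_ptr owner is gone.
uint64_t size_tracker = 0;
cudaStream_t stream = cuda_create_stream(0); // assumed device.h stream API
auto d_buf = cuda_make_shared_with_size_tracking_async<uint64_t>(
    256 * sizeof(uint64_t), stream, /*gpu_index=*/0, size_tracker,
    /*allocate_gpu_memory=*/true);
// Copies of d_buf share ownership, so a buffer reused by several LUT objects
// outlives whichever owner happens to be released first.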
" + "The barrier was created on a LUT that had %lu active streams, while " + "the user stream set has %u streams", + _events.size(), user_streams.count()); + + if (user_streams.count() > 1) { + // Worker GPUs record their events + for (int j = 1; j < user_streams.count(); j++) { + GPU_ASSERT(_streams.gpu_index(j) == user_streams.gpu_index(j), + "CudaStreamsBarrier: The user stream " + "set GPU[%d]=%u while the LUT stream set GPU[%d]=%u", + j, user_streams.gpu_index(j), j, _streams.gpu_index(j)); + + cuda_event_record(_events[j], user_streams.stream(j), + user_streams.gpu_index(j)); + } + + // GPU 0 waits for all workers + for (int j = 1; j < user_streams.count(); j++) { + cuda_stream_wait_event(user_streams.stream(0), _events[j], + user_streams.gpu_index(0)); + } + } + } + + void release() { + for (int j = 0; j < _streams.count(); j++) { + cuda_event_destroy(_events[j], _streams.gpu_index(j)); + } + + _events.clear(); + } + + ~CudaStreamsBarrier() { + GPU_ASSERT(_events.empty(), + "CudaStreamsBarrier: must " + "call release before destruction: events size = %lu", + _events.size()); + } +}; + #endif diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 2aaac2f069..5dbd18e08d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -276,16 +276,19 @@ struct int_radix_params { }; // Store things needed to apply LUTs -template struct int_radix_lut { +template struct int_radix_lut_generic { int_radix_params params; - uint32_t num_blocks; - uint32_t num_luts; + uint32_t num_blocks = 0; + uint32_t num_input_blocks = 0; + uint32_t num_luts = 0; uint32_t num_many_lut = 1; + uint32_t input_big_lwe_dimension = (uint32_t)-1; + // Tracks the degree of each LUT and the max degree on CPU // The max degree is (message_modulus * carry_modulus - 1) except for many lut // for which it's different - uint64_t *degrees; - uint64_t *max_degrees; + uint64_t *degrees = nullptr; + uint64_t *max_degrees = nullptr; CudaStreams active_streams; bool mem_reuse = false; @@ -297,63 +300,80 @@ template struct int_radix_lut { // These arrays will reside on all GPUs // lut could actually be allocated & initialized GPU per GPU but this is not // done at the moment - std::vector lut_vec; + std::vector lut_vec; std::vector lut_indexes_vec; - Torus *h_lut_indexes; + Torus *h_lut_indexes = nullptr; // All tmp lwe arrays and index arrays for lwe contain the total // amount of blocks to be computed on, there is no split between GPUs // for the moment - Torus *lwe_indexes_in; - Torus *lwe_indexes_out; - Torus *h_lwe_indexes_in; - Torus *h_lwe_indexes_out; + std::shared_ptr lwe_indexes_in = nullptr; + Torus *lwe_indexes_out = nullptr; + Torus *h_lwe_indexes_in = nullptr; + Torus *h_lwe_indexes_out = nullptr; // Enable optimizations if lwe_indexes_(in/out) are trivial bool using_trivial_lwe_indexes = true; // lwe_trivial_indexes is the intermediary index we need in case // lwe_indexes_in != lwe_indexes_out - Torus *lwe_trivial_indexes; - CudaRadixCiphertextFFI *tmp_lwe_before_ks; + Torus *lwe_trivial_indexes = nullptr; + // buffer to store packed message bits of a radix ciphertext + std::shared_ptr tmp_lwe_before_ks; /// For multi GPU execution we create vectors of pointers for inputs and /// outputs std::vector lwe_array_in_vec; std::vector lwe_after_ks_vec; - std::vector lwe_after_pbs_vec; + std::vector lwe_after_pbs_vec; 
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 2aaac2f069..5dbd18e08d 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -276,16 +276,19 @@ struct int_radix_params {
 };

 // Store things needed to apply LUTs
-template <typename Torus> struct int_radix_lut {
+template <typename Torus, typename OutputTorus> struct int_radix_lut_generic {
   int_radix_params params;
-  uint32_t num_blocks;
-  uint32_t num_luts;
+  uint32_t num_blocks = 0;
+  uint32_t num_input_blocks = 0;
+  uint32_t num_luts = 0;
   uint32_t num_many_lut = 1;
+  uint32_t input_big_lwe_dimension = (uint32_t)-1;
+
   // Tracks the degree of each LUT and the max degree on CPU
   // The max degree is (message_modulus * carry_modulus - 1) except for many lut
   // for which it's different
-  uint64_t *degrees;
-  uint64_t *max_degrees;
+  uint64_t *degrees = nullptr;
+  uint64_t *max_degrees = nullptr;

   CudaStreams active_streams;
   bool mem_reuse = false;
@@ -297,63 +300,80 @@ template <typename Torus> struct int_radix_lut {
   // These arrays will reside on all GPUs
   // lut could actually be allocated & initialized GPU per GPU but this is not
   // done at the moment
-  std::vector<Torus *> lut_vec;
+  std::vector<OutputTorus *> lut_vec;
   std::vector<Torus *> lut_indexes_vec;
-  Torus *h_lut_indexes;
+  Torus *h_lut_indexes = nullptr;

   // All tmp lwe arrays and index arrays for lwe contain the total
   // amount of blocks to be computed on, there is no split between GPUs
   // for the moment
-  Torus *lwe_indexes_in;
-  Torus *lwe_indexes_out;
-  Torus *h_lwe_indexes_in;
-  Torus *h_lwe_indexes_out;
+  std::shared_ptr<Torus> lwe_indexes_in = nullptr;
+  Torus *lwe_indexes_out = nullptr;
+  Torus *h_lwe_indexes_in = nullptr;
+  Torus *h_lwe_indexes_out = nullptr;

   // Enable optimizations if lwe_indexes_(in/out) are trivial
   bool using_trivial_lwe_indexes = true;
   // lwe_trivial_indexes is the intermediary index we need in case
   // lwe_indexes_in != lwe_indexes_out
-  Torus *lwe_trivial_indexes;
-  CudaRadixCiphertextFFI *tmp_lwe_before_ks;
+  Torus *lwe_trivial_indexes = nullptr;
+  // buffer to store packed message bits of a radix ciphertext
+  std::shared_ptr<CudaRadixCiphertextFFI> tmp_lwe_before_ks;

   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> lwe_array_in_vec;
   std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_after_pbs_vec;
+  std::vector<OutputTorus *> lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec;
   std::vector<Torus *> lwe_aligned_vec;

   bool gpu_memory_allocated;

-  cudaEvent_t event_scatter_in;
-  cudaEvent_t *event_scatter_out;
-  cudaEvent_t event_broadcast;
-
-  int_radix_lut(CudaStreams streams, int_radix_params params, uint32_t num_luts,
-                uint32_t num_radix_blocks, bool allocate_gpu_memory,
-                uint64_t &size_tracker) {
-
+  CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
+  CudaStreamsBarrier multi_gpu_gather_barrier;
+
+  // Setup the LUT configuration:
+  // input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
+  // params: cryptographic parameters of the PBS output
+  // num_luts: number of LUTs (or many-LUT sets) in this structure
+  // num_many_lut: number of LUTs to apply in a single PBS pass
+  // num_radix_blocks: number of blocks in the radix integer
+  void setup_config_and_degrees(CudaStreams streams,
+                                uint32_t input_big_lwe_dimension,
+                                int_radix_params params, uint32_t num_luts,
+                                uint32_t num_many_lut,
+                                uint32_t num_radix_blocks,
+                                uint32_t num_input_blocks,
+                                bool allocate_gpu_memory) {
    this->params = params;
    this->num_blocks = num_radix_blocks;
    this->num_luts = num_luts;
-    gpu_memory_allocated = allocate_gpu_memory;
-    uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
-    uint64_t lut_buffer_size =
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
+    this->num_many_lut = num_many_lut;
+    this->input_big_lwe_dimension = input_big_lwe_dimension;
+    this->num_input_blocks = num_input_blocks;
+    this->gpu_memory_allocated = allocate_gpu_memory;

-    active_streams = streams.active_gpu_subset(num_radix_blocks);
+    this->active_streams = streams.active_gpu_subset(num_radix_blocks);
+  }
+
+  void setup_degrees() {
+    this->degrees =
+        (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
+    this->max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
+  }

-    ///////////////
+  void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
+                            bool allocate_gpu_memory, uint64_t &size_tracker) {
    for (uint i = 0; i < active_streams.count(); i++) {
-      cuda_set_device(streams.gpu_index(i));
+      cuda_set_device(active_streams.gpu_index(i));
      int8_t *gpu_pbs_buffer;
      auto num_blocks_on_gpu = std::max(
          THRESHOLD_MULTI_GPU,
          get_num_inputs_on_gpu(num_radix_blocks, i, active_streams.count()));
      uint64_t size = 0;
-      execute_scratch_pbs<Torus>(
-          streams.stream(i), streams.gpu_index(i), &gpu_pbs_buffer,
-          params.glwe_dimension, params.small_lwe_dimension,
+      execute_scratch_pbs<OutputTorus>(
+          active_streams.stream(i), active_streams.gpu_index(i),
+          &gpu_pbs_buffer, params.glwe_dimension, params.small_lwe_dimension,
          params.polynomial_size, params.pbs_level, params.grouping_factor,
          num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory,
          params.noise_reduction_type, size);
@@ -363,111 +383,52 @@ template <typename Torus> struct int_radix_lut {
      buffer.push_back(gpu_pbs_buffer);
    }

-    // We create the events only if we have multiple GPUs
-    if (active_streams.count() > 1) {
-      event_scatter_in = cuda_create_event(streams.gpu_index(0));
-      event_broadcast = cuda_create_event(streams.gpu_index(0));
-
-      event_scatter_out =
-          (cudaEvent_t *)malloc(active_streams.count() * sizeof(cudaEvent_t));
-      for (int i = 0; i < active_streams.count(); i++) {
-        event_scatter_out[i] = cuda_create_event(active_streams.gpu_index(i));
-      }
-    }
-
-    // Allocate LUT
-    // LUT is used as a trivial encryption and must be initialized outside
-    // this constructor
-    for (uint i = 0; i < active_streams.count(); i++) {
-      auto lut = (Torus
*)cuda_malloc_with_size_tracking_async( - num_luts * lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - lut_indexes_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - // lut_indexes is initialized to 0 by default - // if a different behavior is wanted, it should be rewritten later - cuda_memset_with_size_tracking_async( - lut_indexes, 0, lut_indexes_size, streams.stream(i), - streams.gpu_index(i), allocate_gpu_memory); - - lut_vec.push_back(lut); - lut_indexes_vec.push_back(lut_indexes); - } - - // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) - // by default - lwe_indexes_in = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_indexes_out = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - - h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); - h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); - for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes_in[i] = i; + tmp_lwe_before_ks = std::make_shared(); + create_zero_radix_ciphertext_async( + active_streams.stream(0), active_streams.gpu_index(0), + tmp_lwe_before_ks.get(), num_radix_blocks, input_big_lwe_dimension, + size_tracker, allocate_gpu_memory); + } - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - memcpy(h_lwe_indexes_out, h_lwe_indexes_in, - num_radix_blocks * sizeof(Torus)); + void alloc_and_init_multi_gpu_buffers(int_radix_params params, + uint32_t num_radix_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + GPU_ASSERT(lwe_array_in_vec.empty(), "Multi GPU buffers already allocated"); /// With multiple GPUs we allocate arrays to be pushed to the vectors and /// copy data on each GPU then when we gather data to GPU 0 we can copy /// back to the original indexing - multi_gpu_alloc_lwe_async(streams, lwe_array_in_vec, num_radix_blocks, - params.big_lwe_dimension + 1, size_tracker, - allocate_gpu_memory); - multi_gpu_alloc_lwe_async(streams, lwe_after_ks_vec, num_radix_blocks, - params.small_lwe_dimension + 1, size_tracker, - allocate_gpu_memory); - multi_gpu_alloc_lwe_async(streams, lwe_after_pbs_vec, num_radix_blocks, - params.big_lwe_dimension + 1, size_tracker, - allocate_gpu_memory); - multi_gpu_alloc_array_async(streams, lwe_trivial_indexes_vec, + multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec, + num_radix_blocks, params.big_lwe_dimension + 1, + size_tracker, allocate_gpu_memory); + multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec, + num_radix_blocks, 
params.small_lwe_dimension + 1, + size_tracker, allocate_gpu_memory); + if (num_many_lut > 1) { + multi_gpu_alloc_lwe_many_lut_output_async( + active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut, + params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory); + } else { + multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec, + num_radix_blocks, params.big_lwe_dimension + 1, + size_tracker, allocate_gpu_memory); + } + multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec, num_radix_blocks, size_tracker, allocate_gpu_memory); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + cuda_synchronize_stream(active_streams.stream(0), + active_streams.gpu_index(0)); + + // This call will not copy if allocate_gpu_memory is false + // thus it's safe to call it on a null source pointer multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec, lwe_trivial_indexes, num_radix_blocks, allocate_gpu_memory); - - // Keyswitch - tmp_lwe_before_ks = new CudaRadixCiphertextFFI; - create_zero_radix_ciphertext_async( - streams.stream(0), streams.gpu_index(0), tmp_lwe_before_ks, - num_radix_blocks, params.big_lwe_dimension, size_tracker, - allocate_gpu_memory); - h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus))); - degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); } - // constructor to reuse memory - int_radix_lut(CudaStreams streams, int_radix_params params, uint32_t num_luts, - uint32_t num_radix_blocks, int_radix_lut *base_lut_object, - bool allocate_gpu_memory, uint64_t &size_tracker) { - - this->params = params; - this->num_blocks = num_radix_blocks; - this->num_luts = num_luts; - gpu_memory_allocated = allocate_gpu_memory; - uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus); - uint64_t lut_buffer_size = - (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); - + void setup_mem_reuse(uint32_t num_radix_blocks, + int_radix_lut_generic *base_lut_object) { // base lut object should have bigger or equal memory than current one if (num_radix_blocks > base_lut_object->num_blocks) PANIC("Cuda error: lut does not have enough blocks") @@ -485,38 +446,22 @@ template struct int_radix_lut { lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec; mem_reuse = true; + } - // Allocate LUT - // LUT is used as a trivial encryption and must be initialized outside - // this constructor - active_streams = streams.active_gpu_subset(num_radix_blocks); - for (uint i = 0; i < active_streams.count(); i++) { - auto lut = (Torus *)cuda_malloc_with_size_tracking_async( - num_luts * lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - lut_indexes_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - // lut_indexes is initialized to 0 by default - // if a different behavior is wanted, it should be rewritten later - cuda_memset_with_size_tracking_async( - lut_indexes, 0, lut_indexes_size, streams.stream(i), - streams.gpu_index(i), allocate_gpu_memory); - lut_vec.push_back(lut); - lut_indexes_vec.push_back(lut_indexes); - } - + void setup_lwe_trivial_indices(uint32_t num_radix_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) // by default - lwe_indexes_in = (Torus *)cuda_malloc_with_size_tracking_async( - 
num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + lwe_indexes_in = cuda_make_shared_with_size_tracking_async( + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), size_tracker, allocate_gpu_memory); lwe_indexes_out = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), size_tracker, allocate_gpu_memory); lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), size_tracker, allocate_gpu_memory); h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); @@ -525,147 +470,131 @@ template struct int_radix_lut { h_lwe_indexes_in[i] = i; cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); + lwe_indexes_in.get(), h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), allocate_gpu_memory); cuda_memcpy_with_size_tracking_async_to_gpu( lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); + active_streams.stream(0), active_streams.gpu_index(0), + allocate_gpu_memory); cuda_memcpy_with_size_tracking_async_to_gpu( lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); + active_streams.stream(0), active_streams.gpu_index(0), + allocate_gpu_memory); memcpy(h_lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus)); + h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus))); - degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); } - // Construction for many luts - int_radix_lut(CudaStreams streams, int_radix_params params, uint32_t num_luts, - uint32_t num_radix_blocks, uint32_t num_many_lut, - bool allocate_gpu_memory, uint64_t &size_tracker) { + void setup_multi_gpu(int_radix_params params, uint32_t num_radix_blocks, + bool allocate_gpu_memory, uint64_t &size_tracker) { - this->num_many_lut = num_many_lut; - this->params = params; - this->num_blocks = num_radix_blocks; - this->num_luts = num_luts; - gpu_memory_allocated = allocate_gpu_memory; - uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus); - uint64_t lut_buffer_size = - (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); - - /////////////// - active_streams = streams.active_gpu_subset(num_radix_blocks); - for (uint i = 0; i < active_streams.count(); i++) { - cuda_set_device(streams.gpu_index(i)); - int8_t *gpu_pbs_buffer; - auto num_blocks_on_gpu = std::max( - THRESHOLD_MULTI_GPU, - get_num_inputs_on_gpu(num_radix_blocks, i, active_streams.count())); + if (!mem_reuse) + alloc_and_init_multi_gpu_buffers(params, num_radix_blocks, + allocate_gpu_memory, size_tracker); - uint64_t size = 0; - execute_scratch_pbs( - streams.stream(i), streams.gpu_index(i), &gpu_pbs_buffer, - 
params.glwe_dimension, params.small_lwe_dimension, - params.polynomial_size, params.pbs_level, params.grouping_factor, - num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory, - params.noise_reduction_type, size); - if (i == 0) { - size_tracker += size; - } - buffer.push_back(gpu_pbs_buffer); - } - // We create the events only if we have multiple GPUs if (active_streams.count() > 1) { - event_scatter_in = cuda_create_event(streams.gpu_index(0)); - event_broadcast = cuda_create_event(streams.gpu_index(0)); - - event_scatter_out = - (cudaEvent_t *)malloc(active_streams.count() * sizeof(cudaEvent_t)); - for (int i = 0; i < active_streams.count(); i++) { - event_scatter_out[i] = cuda_create_event(active_streams.gpu_index(i)); - } - } - // Allocate LUT - // LUT is used as a trivial encryption and must be initialized outside - // this constructor - for (uint i = 0; i < active_streams.count(); i++) { - auto lut = (Torus *)cuda_malloc_with_size_tracking_async( - num_luts * lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - lut_indexes_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - // lut_indexes is initialized to 0 by default - // if a different behavior is wanted, it should be rewritten later - cuda_memset_with_size_tracking_async( - lut_indexes, 0, lut_indexes_size, streams.stream(i), - streams.gpu_index(i), allocate_gpu_memory); - lut_vec.push_back(lut); - lut_indexes_vec.push_back(lut_indexes); + // event_scatter_in = cuda_create_event(active_streams.gpu_index(0)); + multi_gpu_gather_barrier.create_on(active_streams); + multi_gpu_broadcast_barrier.create_on(active_streams); + multi_gpu_scatter_barrier.create_on(active_streams); } + } - // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) - // by default - lwe_indexes_in = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_indexes_out = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + int_radix_lut_generic(CudaStreams streams, int_radix_params params, + uint32_t num_luts, uint32_t num_radix_blocks, + bool allocate_gpu_memory, uint64_t &size_tracker) { - h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); - h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + setup_config_and_degrees(streams, params.big_lwe_dimension, params, + num_luts, 1, num_radix_blocks, 1, + allocate_gpu_memory); - for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes_in[i] = i; + setup_degrees(); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - 
memcpy(h_lwe_indexes_out, h_lwe_indexes_in, - num_radix_blocks * sizeof(Torus)); + allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); - /// With multiple GPUs we allocate arrays to be pushed to the vectors and - /// copy data on each GPU then when we gather data to GPU 0 we can copy - /// back to the original indexing - multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec, - num_radix_blocks, params.big_lwe_dimension + 1, - size_tracker, allocate_gpu_memory); - multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec, - num_radix_blocks, params.small_lwe_dimension + 1, - size_tracker, allocate_gpu_memory); - multi_gpu_alloc_lwe_many_lut_output_async( - active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut, - params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory); - multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec, - num_radix_blocks, size_tracker, - allocate_gpu_memory); - multi_gpu_copy_array_from_cpu_async(active_streams, lwe_trivial_indexes_vec, - h_lwe_indexes_in, num_radix_blocks, - allocate_gpu_memory); - // Keyswitch - tmp_lwe_before_ks = new CudaRadixCiphertextFFI; - create_zero_radix_ciphertext_async( - streams.stream(0), streams.gpu_index(0), tmp_lwe_before_ks, - num_radix_blocks, params.big_lwe_dimension, size_tracker, - allocate_gpu_memory); - h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus))); - degrees = (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + } + + int_radix_lut_generic(CudaStreams streams, uint32_t input_big_lwe_dimension, + int_radix_params params, uint32_t num_luts, + uint32_t num_radix_blocks, uint32_t num_input_blocks, + bool allocate_gpu_memory, uint64_t &size_tracker) { + + setup_config_and_degrees(streams, input_big_lwe_dimension, params, num_luts, + 1, num_radix_blocks, num_input_blocks, + allocate_gpu_memory); + + setup_degrees(); + + allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + } + + // constructor to reuse memory + int_radix_lut_generic(CudaStreams streams, int_radix_params params, + uint32_t num_luts, uint32_t num_radix_blocks, + int_radix_lut_generic *base_lut_object, + bool allocate_gpu_memory, uint64_t &size_tracker) { + setup_config_and_degrees(streams, params.big_lwe_dimension, params, + num_luts, 1, num_radix_blocks, 1, + allocate_gpu_memory); + + setup_degrees(); + + setup_mem_reuse(num_radix_blocks, base_lut_object); + + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + } + + // Construction for many luts + int_radix_lut_generic(CudaStreams streams, int_radix_params params, + uint32_t num_luts, uint32_t num_radix_blocks, + uint32_t num_many_lut, bool allocate_gpu_memory, + uint64_t &size_tracker) { + + setup_config_and_degrees(streams, params.big_lwe_dimension, params, + num_luts, num_many_lut, 
num_radix_blocks, 1, + allocate_gpu_memory); + + setup_degrees(); + + allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); } // Return a pointer to idx-ith lut at gpu_index's global memory - Torus *get_lut(uint32_t gpu_index, size_t idx) { + OutputTorus *get_lut(uint32_t gpu_index, size_t idx) { if (!gpu_memory_allocated) return nullptr; auto lut = lut_vec[gpu_index]; @@ -677,10 +606,16 @@ template struct int_radix_lut { } // Return a pointer to idx-ith degree - uint64_t *get_degree(size_t idx) { return °rees[num_many_lut * idx]; } + uint64_t *get_degree(size_t idx) { + GPU_ASSERT(idx < num_luts, "Invalid degree requested"); + return °rees[num_many_lut * idx]; + } // Return a pointer to idx-ith max degree - uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; } + uint64_t *get_max_degree(size_t idx) { + GPU_ASSERT(idx < num_luts, "Invalid degree requested"); + return &max_degrees[idx]; + } // Return a pointer to idx-ith lut indexes at gpu_index's global memory Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) { @@ -690,6 +625,31 @@ template struct int_radix_lut { return &lut_indexes[ind]; } + // Allocate LUT + // LUT is used as a trivial encryption and must be initialized outside + // this constructor + void allocate_luts_and_indexes(uint32_t num_radix_blocks, + uint64_t &size_tracker) { + uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus); + uint64_t lut_buffer_size = (params.glwe_dimension + 1) * + params.polynomial_size * sizeof(OutputTorus); + + for (uint i = 0; i < active_streams.count(); i++) { + auto lut = (OutputTorus *)cuda_malloc_with_size_tracking_async( + num_luts * lut_buffer_size, active_streams.stream(i), + active_streams.gpu_index(i), size_tracker, gpu_memory_allocated); + auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( + lut_indexes_size, active_streams.stream(i), + active_streams.gpu_index(i), size_tracker, gpu_memory_allocated); + // lut_indexes is initialized to 0 by default + // if a different behavior is wanted, it should be rewritten later + cuda_memset_with_size_tracking_async( + lut_indexes, 0, lut_indexes_size, active_streams.stream(i), + active_streams.gpu_index(i), gpu_memory_allocated); + lut_vec.push_back(lut); + lut_indexes_vec.push_back(lut_indexes); + } + } // If this function is called we assume the lwe_indexes_(in/out) are not the // trivial anymore and thus we disable optimizations void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index, @@ -699,8 +659,8 @@ template struct int_radix_lut { memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus)); cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_blocks * sizeof(Torus), stream, - gpu_index, gpu_memory_allocated); + lwe_indexes_in.get(), h_lwe_indexes_in, num_blocks * sizeof(Torus), + stream, gpu_index, gpu_memory_allocated); cuda_memcpy_with_size_tracking_async_to_gpu( lwe_indexes_out, h_lwe_indexes_out, num_blocks * sizeof(Torus), stream, gpu_index, gpu_memory_allocated); @@ -711,47 +671,66 @@ template struct int_radix_lut { // Broadcast luts from device gpu_indexes[0] to all active gpus void broadcast_lut(CudaStreams new_active_streams, bool broadcast_lut_values = true) { + PANIC_IF_FALSE(new_active_streams.gpu_index(0) == + 
active_streams.gpu_index(0), + "Broadcasting LUTs can only be done using the same GPUs " + " originally assigned to the int_radix_lut"); + // We only do broadcast if there are more than 1 active GPU - if (new_active_streams.count() > 1) { - int active_device = cuda_get_device(); - - uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size; - - auto src_lut = lut_vec[0]; - auto src_lut_indexes = lut_indexes_vec[0]; - if (active_streams.count() > 1) - cuda_event_record(event_broadcast, new_active_streams.stream(0), - new_active_streams.gpu_index(0)); - for (uint i = 0; i < new_active_streams.count(); i++) { - PANIC_IF_FALSE( - new_active_streams.gpu_index(i) == active_streams.gpu_index(i), - "Broadcasting LUTs can only be done to the LUT streams or to new " - "streams that reside on the same GPUs as the source LUTs"); + if (new_active_streams.count() == 1) + return; + + GPU_ASSERT(active_streams.count() >= new_active_streams.count(), + "To broadcast a LUT to a GPU set, it must have been initialized " + "with a GPU set that is greater or equal in size"); + + int active_device = cuda_get_device(); + + uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size; + + // Wait for GPU 0 to receive all data from previous computations + // that may have occurred on different GPUs + multi_gpu_broadcast_barrier.local_streams_wait_for_stream_0( + new_active_streams); + // The LUT and its indexes reside on GPU 0 + // these were filled by calls to generate_device_accumulator + // due to the previous synchronization, we're sure these buffers have + // finished copying to GPU 0 from CPU + auto src_lut = lut_vec[0]; + auto src_lut_indexes = lut_indexes_vec[0]; + + for (uint i = 1; i < new_active_streams.count(); i++) { + PANIC_IF_FALSE( + new_active_streams.gpu_index(i) == active_streams.gpu_index(i), + "Broadcasting LUTs can only be done to the LUT streams or to new " + "streams that reside on the same GPUs as the source LUTs"); + + // Check for redundant copies #ifndef DEBUG_FAKE_MULTI_GPU - if (new_active_streams.gpu_index(i) == new_active_streams.gpu_index(0)) - continue; + PANIC_IF_FALSE(new_active_streams.gpu_index(i) != + new_active_streams.gpu_index(0), + "Broadcast LUT does not handle duplicate GPUs in the " + "active streams set"); #endif - cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast, - new_active_streams.gpu_index(i)); - if (broadcast_lut_values) { - auto dst_lut = lut_vec[i]; - cuda_memcpy_with_size_tracking_async_gpu_to_gpu( - dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), - new_active_streams.stream(i), new_active_streams.gpu_index(i), - gpu_memory_allocated); - } - auto dst_lut_indexes = lut_indexes_vec[i]; + if (broadcast_lut_values) { + auto dst_lut = lut_vec[i]; cuda_memcpy_with_size_tracking_async_gpu_to_gpu( - dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus), + dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), new_active_streams.stream(i), new_active_streams.gpu_index(i), gpu_memory_allocated); } - // Ensure the device set at the end of this method is the same as it was - // set at the beginning - cuda_set_device(active_device); + auto dst_lut_indexes = lut_indexes_vec[i]; + cuda_memcpy_with_size_tracking_async_gpu_to_gpu( + dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus), + new_active_streams.stream(i), new_active_streams.gpu_index(i), + gpu_memory_allocated); } + + // Ensure the device set at the end of this method is the same as it was + // set at the beginning + cuda_set_device(active_device); 
} void allocate_lwe_vector_for_non_trivial_indexes( @@ -779,75 +758,80 @@ template struct int_radix_lut { PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(), "Lut vec and Lut vec indexes must have the same size"); for (uint i = 0; i < lut_vec.size(); i++) { - cuda_drop_with_size_tracking_async(lut_vec[i], streams.stream(i), - streams.gpu_index(i), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams.stream(i), - streams.gpu_index(i), + cuda_drop_with_size_tracking_async(lut_vec[i], active_streams.stream(i), + active_streams.gpu_index(i), gpu_memory_allocated); - } - cuda_drop_with_size_tracking_async(lwe_indexes_in, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lwe_indexes_out, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); + cuda_drop_with_size_tracking_async( + lut_indexes_vec[i], active_streams.stream(i), + active_streams.gpu_index(i), gpu_memory_allocated); + } + lwe_indexes_in.reset(); + /*cuda_drop_with_size_tracking_async(lwe_indexes_in, + active_streams.stream(0), active_streams.gpu_index(0), + gpu_memory_allocated);*/ + cuda_drop_with_size_tracking_async( + lwe_indexes_out, active_streams.stream(0), active_streams.gpu_index(0), + gpu_memory_allocated); + cuda_drop_with_size_tracking_async( + lwe_trivial_indexes, active_streams.stream(0), + active_streams.gpu_index(0), gpu_memory_allocated); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + cuda_synchronize_stream(active_streams.stream(0), + active_streams.gpu_index(0)); lut_vec.clear(); lut_indexes_vec.clear(); free(h_lwe_indexes_in); free(h_lwe_indexes_out); + if (active_streams.count() > 1) { + active_streams.synchronize(); + multi_gpu_gather_barrier.release(); + multi_gpu_broadcast_barrier.release(); + multi_gpu_scatter_barrier.release(); + } + if (!mem_reuse) { - release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - tmp_lwe_before_ks, gpu_memory_allocated); + GPU_ASSERT(tmp_lwe_before_ks.use_count() == 1, + "This int_radix_lut is still sharing memory with another"); + release_radix_ciphertext_async( + active_streams.stream(0), active_streams.gpu_index(0), + tmp_lwe_before_ks.get(), gpu_memory_allocated); for (int i = 0; i < buffer.size(); i++) { switch (params.pbs_type) { case MULTI_BIT: cleanup_cuda_multi_bit_programmable_bootstrap( - streams.stream(i), streams.gpu_index(i), &buffer[i]); + active_streams.stream(i), active_streams.gpu_index(i), + &buffer[i]); break; case CLASSICAL: - cleanup_cuda_programmable_bootstrap(streams.stream(i), - streams.gpu_index(i), &buffer[i]); + cleanup_cuda_programmable_bootstrap(active_streams.stream(i), + active_streams.gpu_index(i), + &buffer[i]); break; default: PANIC("Cuda error (PBS): unknown PBS type. 
") } - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); + cuda_synchronize_stream(active_streams.stream(i), + active_streams.gpu_index(i)); } - delete tmp_lwe_before_ks; + tmp_lwe_before_ks.reset(); buffer.clear(); if (gpu_memory_allocated) { - multi_gpu_release_async(streams, lwe_array_in_vec); - multi_gpu_release_async(streams, lwe_after_ks_vec); - multi_gpu_release_async(streams, lwe_after_pbs_vec); - multi_gpu_release_async(streams, lwe_trivial_indexes_vec); - streams.synchronize(); + multi_gpu_release_async(active_streams, lwe_array_in_vec); + multi_gpu_release_async(active_streams, lwe_after_ks_vec); + multi_gpu_release_async(active_streams, lwe_after_pbs_vec); + multi_gpu_release_async(active_streams, lwe_trivial_indexes_vec); } lwe_array_in_vec.clear(); lwe_after_ks_vec.clear(); lwe_after_pbs_vec.clear(); lwe_trivial_indexes_vec.clear(); - if (active_streams.count() > 1) { - active_streams.synchronize(); - for (uint i = 0; i < active_streams.count(); i++) { - cuda_event_destroy(event_scatter_out[i], active_streams.gpu_index(i)); - } - cuda_event_destroy(event_scatter_in, active_streams.gpu_index(0)); - cuda_event_destroy(event_broadcast, active_streams.gpu_index(0)); - free(event_scatter_out); - } if (lwe_aligned_vec.size() > 0) { for (uint i = 0; i < active_streams.count(); i++) { cuda_drop_with_size_tracking_async( - lwe_aligned_vec[i], streams.stream(0), streams.gpu_index(0), - gpu_memory_allocated); + lwe_aligned_vec[i], active_streams.stream(0), + active_streams.gpu_index(0), gpu_memory_allocated); } lwe_aligned_vec.clear(); } @@ -858,46 +842,13 @@ template struct int_radix_lut { } }; -template struct int_noise_squashing_lut { - - int_radix_params params; - uint32_t input_glwe_dimension; - uint32_t input_polynomial_size; - uint32_t input_big_lwe_dimension; - uint32_t num_blocks; - // Tracks the degree of each LUT and the max degree on CPU - // The max degree is (message_modulus * carry_modulus - 1) except for many lut - // for which it's different - uint64_t *degrees; - uint64_t *max_degrees; - - CudaStreams active_streams; - - // There will be one buffer on each GPU in multi-GPU computations - // (same for tmp lwe arrays) - std::vector pbs_buffer; - - std::vector<__uint128_t *> lut_vec; +template +using int_radix_lut = int_radix_lut_generic; - CudaRadixCiphertextFFI *tmp_lwe_before_ks; +template +struct int_noise_squashing_lut + : int_radix_lut_generic { - // All tmp lwe arrays and index arrays for lwe contain the total - // amount of blocks to be computed on, there is no split between GPUs - // for the moment - InputTorus *lwe_indexes_in; - - InputTorus *h_lwe_indexes_in; - InputTorus *lwe_trivial_indexes; - - /// For multi GPU execution we create vectors of pointers for inputs and - /// outputs - std::vector lwe_array_in_vec; - std::vector lwe_after_ks_vec; - std::vector<__uint128_t *> lwe_after_pbs_vec; - std::vector lwe_trivial_indexes_vec; - - bool using_trivial_lwe_indexes = true; - bool gpu_memory_allocated; std::vector lwe_aligned_scatter_vec; std::vector<__uint128_t *> lwe_aligned_gather_vec; // noise squashing constructor @@ -906,165 +857,26 @@ template struct int_noise_squashing_lut { uint32_t input_polynomial_size, uint32_t num_radix_blocks, uint32_t original_num_blocks, - bool allocate_gpu_memory, uint64_t &size_tracker) { - this->params = params; - this->num_blocks = num_radix_blocks; - gpu_memory_allocated = allocate_gpu_memory; - // This are the glwe dimension and polynomial size before squashing - this->input_glwe_dimension = 
input_glwe_dimension; - this->input_polynomial_size = input_polynomial_size; - uint32_t input_big_lwe_dimension = - input_glwe_dimension * input_polynomial_size; - this->input_big_lwe_dimension = input_big_lwe_dimension; - - uint64_t lut_buffer_size = (params.glwe_dimension + 1) * - params.polynomial_size * sizeof(__uint128_t); + bool allocate_gpu_memory, uint64_t &size_tracker) - /////////////// - active_streams = streams.active_gpu_subset(num_radix_blocks); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - for (uint i = 0; i < active_streams.count(); i++) { - cuda_set_device(streams.gpu_index(i)); - auto num_radix_blocks_on_gpu = std::max( - THRESHOLD_MULTI_GPU, - get_num_inputs_on_gpu(num_radix_blocks, i, active_streams.count())); - int8_t *gpu_pbs_buffer; - uint64_t size = 0; - execute_scratch_pbs<__uint128_t>( - streams.stream(i), streams.gpu_index(i), &gpu_pbs_buffer, - params.glwe_dimension, params.small_lwe_dimension, - params.polynomial_size, params.pbs_level, params.grouping_factor, - num_radix_blocks_on_gpu, params.pbs_type, allocate_gpu_memory, - params.noise_reduction_type, size); - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); - if (i == 0) { - size_tracker += size; - } - pbs_buffer.push_back(gpu_pbs_buffer); - } - lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(InputTorus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(InputTorus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - h_lwe_indexes_in = - (InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus)); - for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes_in[i] = i; - - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_trivial_indexes, h_lwe_indexes_in, - num_radix_blocks * sizeof(InputTorus), streams.stream(0), - streams.gpu_index(0), allocate_gpu_memory); - - multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec, - num_radix_blocks, params.big_lwe_dimension + 1, - size_tracker, allocate_gpu_memory); - - multi_gpu_alloc_lwe_async( - active_streams, lwe_after_ks_vec, num_radix_blocks, - params.small_lwe_dimension + 1, size_tracker, allocate_gpu_memory); - multi_gpu_alloc_lwe_async<__uint128_t>( - active_streams, lwe_after_pbs_vec, num_radix_blocks, - params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory); - multi_gpu_alloc_array_async( - active_streams, lwe_trivial_indexes_vec, num_radix_blocks, size_tracker, - allocate_gpu_memory); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec, - lwe_trivial_indexes, num_radix_blocks, - allocate_gpu_memory); - if (allocate_gpu_memory) { - // Allocate LUT - // LUT is used as a trivial encryption and must be initialized outside - // this constructor - for (uint i = 0; i < active_streams.count(); i++) { - auto lut = (__uint128_t *)cuda_malloc_with_size_tracking_async( - lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - lut_vec.push_back(lut); - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); - } - } - // Keyswitch - tmp_lwe_before_ks = new CudaRadixCiphertextFFI; - 
create_zero_radix_ciphertext_async( - streams.stream(0), streams.gpu_index(0), tmp_lwe_before_ks, - original_num_blocks, input_big_lwe_dimension, size_tracker, - allocate_gpu_memory); - - degrees = (uint64_t *)malloc(sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(sizeof(uint64_t)); + : int_radix_lut_generic( + streams, input_glwe_dimension * input_polynomial_size, params, 1, + num_radix_blocks, original_num_blocks, allocate_gpu_memory, + size_tracker) { // lut for the squashing auto f_squash = [](__uint128_t block) -> __uint128_t { return block; }; - // Generate the identity LUT, for now we only use one GPU - for (uint i = 0; i < active_streams.count(); i++) { - auto squash_lut = lut_vec[i]; - generate_device_accumulator<__uint128_t>( - streams.stream(i), streams.gpu_index(i), squash_lut, degrees, - max_degrees, params.glwe_dimension, params.polynomial_size, - params.message_modulus, params.carry_modulus, f_squash, - allocate_gpu_memory); - } - } - void release(CudaStreams streams) { - for (uint i = 0; i < lut_vec.size(); i++) { - cuda_drop_with_size_tracking_async(lut_vec[i], streams.stream(i), - streams.gpu_index(i), - gpu_memory_allocated); - } - cuda_drop_with_size_tracking_async(lwe_indexes_in, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - lut_vec.clear(); - free(h_lwe_indexes_in); + generate_device_accumulator<__uint128_t>( + this->active_streams.stream(0), this->active_streams.gpu_index(0), + this->get_lut(0, 0), this->get_degree(0), this->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, f_squash, allocate_gpu_memory); - release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - tmp_lwe_before_ks, gpu_memory_allocated); - for (int i = 0; i < pbs_buffer.size(); i++) { - switch (params.pbs_type) { - case MULTI_BIT: - cleanup_cuda_multi_bit_programmable_bootstrap_128( - streams.stream(i), streams.gpu_index(i), &pbs_buffer[i]); - break; - case CLASSICAL: - cleanup_cuda_programmable_bootstrap_128( - streams.stream(i), streams.gpu_index(i), &pbs_buffer[i]); - break; - default: - PANIC("Cuda error (PBS): unknown PBS type. 
") - } - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); - } - if (lwe_aligned_gather_vec.size() > 0) { - multi_gpu_release_async(streams, lwe_aligned_gather_vec); - multi_gpu_release_async(streams, lwe_aligned_scatter_vec); - } - multi_gpu_release_async(streams, lwe_array_in_vec); - multi_gpu_release_async(streams, lwe_after_ks_vec); - multi_gpu_release_async(streams, lwe_after_pbs_vec); - multi_gpu_release_async(streams, lwe_trivial_indexes_vec); - streams.synchronize(); - - lwe_array_in_vec.clear(); - lwe_after_ks_vec.clear(); - lwe_after_pbs_vec.clear(); - lwe_trivial_indexes_vec.clear(); - - delete tmp_lwe_before_ks; - pbs_buffer.clear(); + this->broadcast_lut(this->active_streams); } + + using int_radix_lut_generic::release; }; // Forward declarations for operation buffers diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 7c9191d268..342276f762 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -266,6 +266,11 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu( uint32_t gpu_index, bool gpu_memory_allocated) { if (size == 0 || !gpu_memory_allocated) return; + GPU_ASSERT(dest != nullptr, + "Cuda error: trying to copy gpu->gpu to null ptr"); + GPU_ASSERT(src != nullptr, + "Cuda error: trying to copy gpu->gpu from null ptr"); + cudaPointerAttributes attr_dest; check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest)); PANIC_IF_FALSE( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 9c9c5d8c9b..f9ee496aaf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -31,8 +31,8 @@ __host__ void zero_out_if(CudaStreams streams, // second operand is not an array auto tmp_lwe_array_input = mem_ptr->tmp; host_pack_bivariate_blocks_with_single_block( - streams, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input, - lwe_condition, predicate->lwe_indexes_in, params.message_modulus, + streams, tmp_lwe_array_input, predicate->lwe_indexes_in.get(), lwe_array_input, + lwe_condition, predicate->lwe_indexes_in.get(), params.message_modulus, num_radix_blocks); integer_radix_apply_univariate_lookup_table_kb( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index bfc7c3ff9d..807e8e6e3f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -344,7 +344,7 @@ host_integer_decompress(CudaStreams streams, execute_pbs_async( active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe, - lut->lwe_indexes_in, d_bsks, lut->buffer, + lut->lwe_indexes_in.get(), d_bsks, lut->buffer, encryption_params.glwe_dimension, compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, @@ -359,17 +359,13 @@ host_integer_decompress(CudaStreams streams, std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - /// Make sure all data that should be on GPU 0 is indeed there - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - 
streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); + /// With multiple GPUs we push to the vectors on each GPU then when we /// gather data to GPU 0 we can copy back to the original indexing multi_gpu_scatter_lwe_async( - active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in, + active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_blocks_to_decompress, compression_params.small_lwe_dimension + 1); @@ -395,15 +391,8 @@ host_integer_decompress(CudaStreams streams, /// Synchronize all GPUs // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], active_streams.stream(j), - active_streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } } else { static_assert(std::is_same_v, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 4c2d1a13ba..841f31cdc8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -546,7 +546,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in, ksks, + (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in.get(), ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); @@ -560,19 +560,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); /// With multiple GPUs we push to the vectors on each GPU then when we /// gather data to GPU 0 we can copy back to the original indexing PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() @@ -598,16 +594,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lwe_indexes_out, lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); 
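+      // After this barrier, anything enqueued on streams.stream(0) is
+      // ordered after the gather of all per-GPU PBS results, so the output
+      // radix ciphertext can safely be consumed from GPU 0's stream.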
} for (uint i = 0; i < num_radix_blocks; i++) { auto degrees_index = lut->h_lut_indexes[i]; @@ -660,7 +648,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in, ksks, + (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in.get(), ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); @@ -674,18 +662,15 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); + /// With multiple GPUs we push to the vectors on each GPU then when we /// gather data to GPU 0 we can copy back to the original indexing PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() @@ -712,16 +697,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( num_radix_blocks, big_lwe_dimension + 1, num_many_lut); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) { auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks]; @@ -771,10 +748,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( uint32_t lut_stride = 0; // Left message is shifted - auto lwe_array_pbs_in = lut->tmp_lwe_before_ks; + auto lwe_array_pbs_in = lut->tmp_lwe_before_ks.get(); host_pack_bivariate_blocks( streams, lwe_array_pbs_in, lut->lwe_trivial_indexes, lwe_array_1, - lwe_array_2, lut->lwe_indexes_in, shift, num_radix_blocks, + lwe_array_2, lut->lwe_indexes_in.get(), shift, num_radix_blocks, params.message_modulus, params.carry_modulus); check_cuda_error(cudaGetLastError()); @@ -789,7 +766,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in, ksks, + (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in.get(), ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); @@ -802,16 +779,13 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for 
(int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); + PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() @@ -837,16 +811,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lut->lwe_indexes_out, lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } for (uint i = 0; i < num_radix_blocks; i++) { auto degrees_index = lut->h_lut_indexes[i]; @@ -2368,7 +2334,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( /// For multi GPU execution we create vectors of pointers for inputs and /// outputs - auto lwe_array_pbs_in = lut->tmp_lwe_before_ks; + auto lwe_array_pbs_in = lut->tmp_lwe_before_ks.get(); std::vector lwe_array_in_vec = lut->lwe_array_in_vec; std::vector lwe_after_ks_vec = lut->lwe_after_ks_vec; std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec; @@ -2387,7 +2353,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in, ksks, + (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in.get(), ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, lwe_array_out->num_radix_blocks); @@ -2399,7 +2365,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( execute_pbs_async( streams.get_ith(0), (__uint128_t *)lwe_array_out->ptr, lwe_trivial_indexes_vec[0], lut->lut_vec, lwe_trivial_indexes_vec, - lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->pbs_buffer, + lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0, 0); @@ -2411,7 +2377,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( /// gather data to GPU 0 we can copy back to the original indexing multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_scatter_vec, lut->active_streams.count(), lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1); @@ -2426,7 +2392,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( execute_pbs_async( active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec, lut->lut_vec, lwe_trivial_indexes_vec, lwe_after_ks_vec, - lwe_trivial_indexes_vec, bsks, lut->pbs_buffer, glwe_dimension, + lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, 
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0, 0); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 92d53c7f19..d48f171181 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -375,7 +375,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( while (needs_processing) { auto luts_message_carry = mem_ptr->luts_message_carry; - auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in; + auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in.get(); auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out; calculate_chunks <<>>( @@ -433,7 +433,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( if (mem_ptr->reduce_degrees_for_single_carry_propagation) { auto luts_message_carry = mem_ptr->luts_message_carry; - auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in; + auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in.get(); auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out; prepare_final_pbs_indexes <<<1, 2 * num_radix_blocks, 0, streams.stream(0)>>>( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh index 360a403cfa..5147717fcb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh @@ -34,7 +34,7 @@ void host_integer_grouped_oprf(CudaStreams streams, execute_pbs_async( streams.get_ith(0), (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, - const_cast(seeded_lwe_input), lut->lwe_indexes_in, bsks, + const_cast(seeded_lwe_input), lut->lwe_indexes_in.get(), bsks, lut->buffer, mem_ptr->params.glwe_dimension, mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, @@ -45,16 +45,12 @@ void host_integer_grouped_oprf(CudaStreams streams, std::vector lwe_after_pbs_vec = lut->lwe_after_pbs_vec; std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( - active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in, + active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, active_streams.count(), num_blocks_to_process, mem_ptr->params.small_lwe_dimension + 1); @@ -76,16 +72,8 @@ void host_integer_grouped_oprf(CudaStreams streams, lut->lwe_aligned_vec, num_blocks_to_process, mem_ptr->params.big_lwe_dimension + 1); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + 
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } for (uint32_t i = 0; i < num_blocks_to_process; i++) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index 8c183aca1f..3479f45b28 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -150,7 +150,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // control_bit|b|a host_pack_bivariate_blocks( streams, mux_inputs, mux_lut->lwe_indexes_out, rotated_input, - input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits, + input_bits_a, mux_lut->lwe_indexes_in.get(), 2, total_nb_bits, mem->params.message_modulus, mem->params.carry_modulus); // The shift bit is already properly aligned/positioned
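
For reference, every refactored call site above now shares one fork/join shape: stream 0 prepares the input, worker streams consume it, and stream 0 waits for the workers before gathering results back. The sketch below is a minimal illustration, assuming the CudaStreams and CudaStreamsBarrier API from helper_multi_gpu.h; launch_worker_slice is a hypothetical stand-in for the per-GPU keyswitch/PBS work, and both barriers are assumed to have been initialized with create_on() when the LUT was allocated.

#include <cstdint>
#include "helper_multi_gpu.h"

// Hypothetical per-GPU workload, enqueued on the given stream.
void launch_worker_slice(cudaStream_t stream, uint32_t gpu_index);

void apply_multi_gpu(CudaStreams &active_streams,
                     CudaStreamsBarrier &scatter_barrier,
                     CudaStreamsBarrier &gather_barrier) {
  // Worker streams block until stream 0 has finished producing the scattered
  // input; this replaces the per-call-site cuda_event_record /
  // cuda_stream_wait_event loops removed in this diff.
  scatter_barrier.local_streams_wait_for_stream_0(active_streams);

  // Each GPU processes its slice on its own stream.
  for (int j = 0; j < active_streams.count(); j++)
    launch_worker_slice(active_streams.stream(j), active_streams.gpu_index(j));

  // Stream 0 blocks until every worker has published its output, so the
  // gather back to GPU 0 only observes completed results.
  gather_barrier.stream_0_wait_for_local_streams(active_streams);
}

Centralizing the synchronization this way removes five duplicated record/wait loops across the decompression, univariate, many-LUT, bivariate, and OPRF paths.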
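Likewise, the recurring lwe_indexes_in.get() and tmp_lwe_before_ks.get() changes all follow from one ownership move: those members are now std::shared_ptr owners whose deleter frees the device buffer, while kernels and the keyswitch/PBS entry points keep raw-pointer signatures and merely borrow. A minimal sketch of the pattern, with device_alloc_async and device_free_async as hypothetical stand-ins for the backend's tracked, stream-ordered allocator:

#include <cstdint>
#include <memory>

// Hypothetical stand-ins for the backend's tracked device allocation calls.
void *device_alloc_async(uint64_t size);
void device_free_async(void *ptr);

// Wrap a device buffer in a shared_ptr whose deleter performs the free, so
// the buffer is released exactly once however many structures share it.
template <typename T> std::shared_ptr<T> make_device_shared(uint64_t size) {
  return std::shared_ptr<T>(static_cast<T *>(device_alloc_async(size)),
                            [](T *ptr) { device_free_async(ptr); });
}

// Call sites keep the shared_ptr alive and lend out the raw pointer, e.g.:
//   auto lwe_indexes_in = make_device_shared<uint64_t>(bytes);
//   execute_keyswitch_async<uint64_t>(..., lwe_indexes_in.get(), ...);

Each .get() at a call site is then purely a borrow: no call site frees the pointer, and the last owner going out of scope triggers the deleter.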