diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index 14666bd6c5..794f8afae6 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <memory>

 extern "C" {

@@ -140,4 +141,34 @@
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                           Torus *d_array, Torus value, Torus n);
+template <typename T> struct malloc_with_size_tracking_async_deleter {
+private:
+  cudaStream_t _stream;
+  uint32_t _gpu_index;
+  uint64_t &_size_tracker;
+  bool _allocate_gpu_memory;
+
+public:
+  malloc_with_size_tracking_async_deleter(cudaStream_t stream,
+                                          uint32_t gpu_index,
+                                          uint64_t &size_tracker,
+                                          bool allocate_gpu_memory)
+      : _stream(stream), _gpu_index(gpu_index), _size_tracker(size_tracker),
+        _allocate_gpu_memory(allocate_gpu_memory) {}
+
+  void operator()(T *ptr) {
+    cuda_drop_with_size_tracking_async(ptr, _stream, _gpu_index,
+                                       _allocate_gpu_memory);
+  }
+};
+
+template <typename T>
+std::shared_ptr<T> cuda_make_shared_with_size_tracking_async(
+    uint64_t size, cudaStream_t stream, uint32_t gpu_index,
+    uint64_t &size_tracker, bool allocate_gpu_memory) {
+  return std::shared_ptr<T>(
+      (T *)cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
+                                                size_tracker,
+                                                allocate_gpu_memory),
+      malloc_with_size_tracking_async_deleter<T>(
+          stream, gpu_index, size_tracker, allocate_gpu_memory));
+}
+
 #endif
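A minimal usage sketch for the helper above (illustrative only, not part of the patch; it assumes the existing `cuda_create_stream` API from device.h):

// Illustrative sketch: allocate 256 uint64_t words on GPU 0, tracked in
// `size_tracker`, and freed automatically via
// cuda_drop_with_size_tracking_async when the last shared_ptr owner is gone.
uint64_t size_tracker = 0;
cudaStream_t stream = cuda_create_stream(0); // assumed device.h stream API
auto d_buf = cuda_make_shared_with_size_tracking_async<uint64_t>(
    256 * sizeof(uint64_t), stream, /*gpu_index=*/0, size_tracker,
    /*allocate_gpu_memory=*/true);
// Copies of d_buf share ownership, so a buffer reused by several LUT objects
// outlives whichever owner happens to be released first.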
" + "The barrier was created on a LUT that had %lu active streams, while " + "the user stream set has %u streams", + _events.size(), user_streams.count()); + + if (user_streams.count() > 1) { + // Worker GPUs record their events + for (int j = 1; j < user_streams.count(); j++) { + GPU_ASSERT(_streams.gpu_index(j) == user_streams.gpu_index(j), + "CudaStreamsBarrier: The user stream " + "set GPU[%d]=%u while the LUT stream set GPU[%d]=%u", + j, user_streams.gpu_index(j), j, _streams.gpu_index(j)); + + cuda_event_record(_events[j], user_streams.stream(j), + user_streams.gpu_index(j)); + } + + // GPU 0 waits for all workers + for (int j = 1; j < user_streams.count(); j++) { + cuda_stream_wait_event(user_streams.stream(0), _events[j], + user_streams.gpu_index(0)); + } + } + } + + void release() { + for (int j = 0; j < _streams.count(); j++) { + cuda_event_destroy(_events[j], _streams.gpu_index(j)); + } + + _events.clear(); + } + + ~CudaStreamsBarrier() { + GPU_ASSERT(_events.empty(), + "CudaStreamsBarrier: must " + "call release before destruction: events size = %lu", + _events.size()); + } +}; + #endif diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 2aaac2f069..5dbd18e08d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -276,16 +276,19 @@ struct int_radix_params { }; // Store things needed to apply LUTs -template struct int_radix_lut { +template struct int_radix_lut_generic { int_radix_params params; - uint32_t num_blocks; - uint32_t num_luts; + uint32_t num_blocks = 0; + uint32_t num_input_blocks = 0; + uint32_t num_luts = 0; uint32_t num_many_lut = 1; + uint32_t input_big_lwe_dimension = (uint32_t)-1; + // Tracks the degree of each LUT and the max degree on CPU // The max degree is (message_modulus * carry_modulus - 1) except for many lut // for which it's different - uint64_t *degrees; - uint64_t *max_degrees; + uint64_t *degrees = nullptr; + uint64_t *max_degrees = nullptr; CudaStreams active_streams; bool mem_reuse = false; @@ -297,63 +300,80 @@ template struct int_radix_lut { // These arrays will reside on all GPUs // lut could actually be allocated & initialized GPU per GPU but this is not // done at the moment - std::vector lut_vec; + std::vector lut_vec; std::vector lut_indexes_vec; - Torus *h_lut_indexes; + Torus *h_lut_indexes = nullptr; // All tmp lwe arrays and index arrays for lwe contain the total // amount of blocks to be computed on, there is no split between GPUs // for the moment - Torus *lwe_indexes_in; - Torus *lwe_indexes_out; - Torus *h_lwe_indexes_in; - Torus *h_lwe_indexes_out; + std::shared_ptr lwe_indexes_in = nullptr; + Torus *lwe_indexes_out = nullptr; + Torus *h_lwe_indexes_in = nullptr; + Torus *h_lwe_indexes_out = nullptr; // Enable optimizations if lwe_indexes_(in/out) are trivial bool using_trivial_lwe_indexes = true; // lwe_trivial_indexes is the intermediary index we need in case // lwe_indexes_in != lwe_indexes_out - Torus *lwe_trivial_indexes; - CudaRadixCiphertextFFI *tmp_lwe_before_ks; + Torus *lwe_trivial_indexes = nullptr; + // buffer to store packed message bits of a radix ciphertext + std::shared_ptr tmp_lwe_before_ks; /// For multi GPU execution we create vectors of pointers for inputs and /// outputs std::vector lwe_array_in_vec; std::vector lwe_after_ks_vec; - std::vector lwe_after_pbs_vec; + std::vector lwe_after_pbs_vec; 
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 2aaac2f069..5dbd18e08d 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -276,16 +276,19 @@ struct int_radix_params {
 };

 // Store things needed to apply LUTs
-template <typename Torus> struct int_radix_lut {
+template <typename Torus, typename OutputTorus> struct int_radix_lut_generic {
   int_radix_params params;
-  uint32_t num_blocks;
-  uint32_t num_luts;
+  uint32_t num_blocks = 0;
+  uint32_t num_input_blocks = 0;
+  uint32_t num_luts = 0;
   uint32_t num_many_lut = 1;
+  uint32_t input_big_lwe_dimension = (uint32_t)-1;
+
   // Tracks the degree of each LUT and the max degree on CPU
   // The max degree is (message_modulus * carry_modulus - 1) except for many lut
   // for which it's different
-  uint64_t *degrees;
-  uint64_t *max_degrees;
+  uint64_t *degrees = nullptr;
+  uint64_t *max_degrees = nullptr;

   CudaStreams active_streams;
   bool mem_reuse = false;
@@ -297,63 +300,80 @@ template <typename Torus> struct int_radix_lut {
   // These arrays will reside on all GPUs
   // lut could actually be allocated & initialized GPU per GPU but this is not
   // done at the moment
-  std::vector<Torus *> lut_vec;
+  std::vector<OutputTorus *> lut_vec;
   std::vector<Torus *> lut_indexes_vec;
-  Torus *h_lut_indexes;
+  Torus *h_lut_indexes = nullptr;

   // All tmp lwe arrays and index arrays for lwe contain the total
   // amount of blocks to be computed on, there is no split between GPUs
   // for the moment
-  Torus *lwe_indexes_in;
-  Torus *lwe_indexes_out;
-  Torus *h_lwe_indexes_in;
-  Torus *h_lwe_indexes_out;
+  std::shared_ptr<Torus> lwe_indexes_in = nullptr;
+  Torus *lwe_indexes_out = nullptr;
+  Torus *h_lwe_indexes_in = nullptr;
+  Torus *h_lwe_indexes_out = nullptr;

   // Enable optimizations if lwe_indexes_(in/out) are trivial
   bool using_trivial_lwe_indexes = true;
   // lwe_trivial_indexes is the intermediary index we need in case
   // lwe_indexes_in != lwe_indexes_out
-  Torus *lwe_trivial_indexes;
-  CudaRadixCiphertextFFI *tmp_lwe_before_ks;
+  Torus *lwe_trivial_indexes = nullptr;
+  // buffer to store packed message bits of a radix ciphertext
+  std::shared_ptr<CudaRadixCiphertextFFI> tmp_lwe_before_ks;

   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> lwe_array_in_vec;
   std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_after_pbs_vec;
+  std::vector<OutputTorus *> lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec;
   std::vector<Torus *> lwe_aligned_vec;

   bool gpu_memory_allocated;

-  cudaEvent_t event_scatter_in;
-  cudaEvent_t *event_scatter_out;
-  cudaEvent_t event_broadcast;
-
-  int_radix_lut(CudaStreams streams, int_radix_params params, uint32_t num_luts,
-                uint32_t num_radix_blocks, bool allocate_gpu_memory,
-                uint64_t &size_tracker) {
-
+  CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
+  CudaStreamsBarrier multi_gpu_gather_barrier;
+
+  // Setup the LUT configuration:
+  // input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
+  // params: cryptographic parameters of the PBS output
+  // num_luts: number of LUTs (or many-LUT sets) in this structure
+  // num_many_lut: number of LUTs to apply in a single PBS pass
+  // num_radix_blocks: number of blocks in the radix integer
+  void setup_config_and_degrees(CudaStreams streams,
+                                uint32_t input_big_lwe_dimension,
+                                int_radix_params params, uint32_t num_luts,
+                                uint32_t num_many_lut,
+                                uint32_t num_radix_blocks,
+                                uint32_t num_input_blocks,
+                                bool allocate_gpu_memory) {
    this->params = params;
    this->num_blocks = num_radix_blocks;
    this->num_luts = num_luts;
-    gpu_memory_allocated = allocate_gpu_memory;
-    uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
-    uint64_t lut_buffer_size =
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
+    this->num_many_lut = num_many_lut;
+    this->input_big_lwe_dimension = input_big_lwe_dimension;
+    this->num_input_blocks = num_input_blocks;
+    this->gpu_memory_allocated = allocate_gpu_memory;

-    active_streams = streams.active_gpu_subset(num_radix_blocks);
+    this->active_streams = streams.active_gpu_subset(num_radix_blocks);
+  }
+
+  void setup_degrees() {
+    this->degrees =
+        (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
+    this->max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
+  }

-    ///////////////
+  void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
+                            bool allocate_gpu_memory, uint64_t &size_tracker) {
    for (uint i = 0; i < active_streams.count(); i++) {
-      cuda_set_device(streams.gpu_index(i));
+      cuda_set_device(active_streams.gpu_index(i));
      int8_t *gpu_pbs_buffer;
      auto num_blocks_on_gpu = std::max(
          THRESHOLD_MULTI_GPU,
          get_num_inputs_on_gpu(num_radix_blocks, i, active_streams.count()));
      uint64_t size = 0;
-      execute_scratch_pbs<Torus>(
-          streams.stream(i), streams.gpu_index(i), &gpu_pbs_buffer,
-          params.glwe_dimension, params.small_lwe_dimension,
+      execute_scratch_pbs<OutputTorus>(
+          active_streams.stream(i), active_streams.gpu_index(i),
+          &gpu_pbs_buffer, params.glwe_dimension, params.small_lwe_dimension,
          params.polynomial_size, params.pbs_level, params.grouping_factor,
          num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory,
          params.noise_reduction_type, size);
@@ -363,111 +383,52 @@ template <typename Torus> struct int_radix_lut {
      buffer.push_back(gpu_pbs_buffer);
    }

-    // We create the events only if we have multiple GPUs
-    if (active_streams.count() > 1) {
-      event_scatter_in = cuda_create_event(streams.gpu_index(0));
-      event_broadcast = cuda_create_event(streams.gpu_index(0));
-
-      event_scatter_out =
-          (cudaEvent_t *)malloc(active_streams.count() * sizeof(cudaEvent_t));
-      for (int i = 0; i < active_streams.count(); i++) {
-        event_scatter_out[i] = cuda_create_event(active_streams.gpu_index(i));
-      }
-    }
-
-    // Allocate LUT
-    // LUT is used as a trivial encryption and must be initialized outside
-    // this constructor
-    for (uint i = 0; i < active_streams.count(); i++) {
-      auto lut = (Torus
*)cuda_malloc_with_size_tracking_async( - num_luts * lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - lut_indexes_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - // lut_indexes is initialized to 0 by default - // if a different behavior is wanted, it should be rewritten later - cuda_memset_with_size_tracking_async( - lut_indexes, 0, lut_indexes_size, streams.stream(i), - streams.gpu_index(i), allocate_gpu_memory); - - lut_vec.push_back(lut); - lut_indexes_vec.push_back(lut_indexes); - } - - // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) - // by default - lwe_indexes_in = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_indexes_out = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - - h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); - h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); - for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes_in[i] = i; + tmp_lwe_before_ks = std::make_shared(); + create_zero_radix_ciphertext_async( + active_streams.stream(0), active_streams.gpu_index(0), + tmp_lwe_before_ks.get(), num_radix_blocks, input_big_lwe_dimension, + size_tracker, allocate_gpu_memory); + } - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - memcpy(h_lwe_indexes_out, h_lwe_indexes_in, - num_radix_blocks * sizeof(Torus)); + void alloc_and_init_multi_gpu_buffers(int_radix_params params, + uint32_t num_radix_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + GPU_ASSERT(lwe_array_in_vec.empty(), "Multi GPU buffers already allocated"); /// With multiple GPUs we allocate arrays to be pushed to the vectors and /// copy data on each GPU then when we gather data to GPU 0 we can copy /// back to the original indexing - multi_gpu_alloc_lwe_async(streams, lwe_array_in_vec, num_radix_blocks, - params.big_lwe_dimension + 1, size_tracker, - allocate_gpu_memory); - multi_gpu_alloc_lwe_async(streams, lwe_after_ks_vec, num_radix_blocks, - params.small_lwe_dimension + 1, size_tracker, - allocate_gpu_memory); - multi_gpu_alloc_lwe_async(streams, lwe_after_pbs_vec, num_radix_blocks, - params.big_lwe_dimension + 1, size_tracker, - allocate_gpu_memory); - multi_gpu_alloc_array_async(streams, lwe_trivial_indexes_vec, + multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec, + num_radix_blocks, params.big_lwe_dimension + 1, + size_tracker, allocate_gpu_memory); + multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec, + num_radix_blocks, 
params.small_lwe_dimension + 1, + size_tracker, allocate_gpu_memory); + if (num_many_lut > 1) { + multi_gpu_alloc_lwe_many_lut_output_async( + active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut, + params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory); + } else { + multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec, + num_radix_blocks, params.big_lwe_dimension + 1, + size_tracker, allocate_gpu_memory); + } + multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec, num_radix_blocks, size_tracker, allocate_gpu_memory); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + cuda_synchronize_stream(active_streams.stream(0), + active_streams.gpu_index(0)); + + // This call will not copy if allocate_gpu_memory is false + // thus it's safe to call it on a null source pointer multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec, lwe_trivial_indexes, num_radix_blocks, allocate_gpu_memory); - - // Keyswitch - tmp_lwe_before_ks = new CudaRadixCiphertextFFI; - create_zero_radix_ciphertext_async( - streams.stream(0), streams.gpu_index(0), tmp_lwe_before_ks, - num_radix_blocks, params.big_lwe_dimension, size_tracker, - allocate_gpu_memory); - h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus))); - degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); } - // constructor to reuse memory - int_radix_lut(CudaStreams streams, int_radix_params params, uint32_t num_luts, - uint32_t num_radix_blocks, int_radix_lut *base_lut_object, - bool allocate_gpu_memory, uint64_t &size_tracker) { - - this->params = params; - this->num_blocks = num_radix_blocks; - this->num_luts = num_luts; - gpu_memory_allocated = allocate_gpu_memory; - uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus); - uint64_t lut_buffer_size = - (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); - + void setup_mem_reuse(uint32_t num_radix_blocks, + int_radix_lut_generic *base_lut_object) { // base lut object should have bigger or equal memory than current one if (num_radix_blocks > base_lut_object->num_blocks) PANIC("Cuda error: lut does not have enough blocks") @@ -485,38 +446,22 @@ template struct int_radix_lut { lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec; mem_reuse = true; + } - // Allocate LUT - // LUT is used as a trivial encryption and must be initialized outside - // this constructor - active_streams = streams.active_gpu_subset(num_radix_blocks); - for (uint i = 0; i < active_streams.count(); i++) { - auto lut = (Torus *)cuda_malloc_with_size_tracking_async( - num_luts * lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - lut_indexes_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - // lut_indexes is initialized to 0 by default - // if a different behavior is wanted, it should be rewritten later - cuda_memset_with_size_tracking_async( - lut_indexes, 0, lut_indexes_size, streams.stream(i), - streams.gpu_index(i), allocate_gpu_memory); - lut_vec.push_back(lut); - lut_indexes_vec.push_back(lut_indexes); - } - + void setup_lwe_trivial_indices(uint32_t num_radix_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) // by default - lwe_indexes_in = (Torus *)cuda_malloc_with_size_tracking_async( - 
num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + lwe_indexes_in = cuda_make_shared_with_size_tracking_async( + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), size_tracker, allocate_gpu_memory); lwe_indexes_out = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), size_tracker, allocate_gpu_memory); lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), size_tracker, allocate_gpu_memory); h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); @@ -525,147 +470,131 @@ template struct int_radix_lut { h_lwe_indexes_in[i] = i; cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); + lwe_indexes_in.get(), h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), active_streams.stream(0), + active_streams.gpu_index(0), allocate_gpu_memory); cuda_memcpy_with_size_tracking_async_to_gpu( lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); + active_streams.stream(0), active_streams.gpu_index(0), + allocate_gpu_memory); cuda_memcpy_with_size_tracking_async_to_gpu( lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); + active_streams.stream(0), active_streams.gpu_index(0), + allocate_gpu_memory); memcpy(h_lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus)); + h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus))); - degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); } - // Construction for many luts - int_radix_lut(CudaStreams streams, int_radix_params params, uint32_t num_luts, - uint32_t num_radix_blocks, uint32_t num_many_lut, - bool allocate_gpu_memory, uint64_t &size_tracker) { + void setup_multi_gpu(int_radix_params params, uint32_t num_radix_blocks, + bool allocate_gpu_memory, uint64_t &size_tracker) { - this->num_many_lut = num_many_lut; - this->params = params; - this->num_blocks = num_radix_blocks; - this->num_luts = num_luts; - gpu_memory_allocated = allocate_gpu_memory; - uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus); - uint64_t lut_buffer_size = - (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); - - /////////////// - active_streams = streams.active_gpu_subset(num_radix_blocks); - for (uint i = 0; i < active_streams.count(); i++) { - cuda_set_device(streams.gpu_index(i)); - int8_t *gpu_pbs_buffer; - auto num_blocks_on_gpu = std::max( - THRESHOLD_MULTI_GPU, - get_num_inputs_on_gpu(num_radix_blocks, i, active_streams.count())); + if (!mem_reuse) + alloc_and_init_multi_gpu_buffers(params, num_radix_blocks, + allocate_gpu_memory, size_tracker); - uint64_t size = 0; - execute_scratch_pbs( - streams.stream(i), streams.gpu_index(i), &gpu_pbs_buffer, - 
params.glwe_dimension, params.small_lwe_dimension, - params.polynomial_size, params.pbs_level, params.grouping_factor, - num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory, - params.noise_reduction_type, size); - if (i == 0) { - size_tracker += size; - } - buffer.push_back(gpu_pbs_buffer); - } - // We create the events only if we have multiple GPUs if (active_streams.count() > 1) { - event_scatter_in = cuda_create_event(streams.gpu_index(0)); - event_broadcast = cuda_create_event(streams.gpu_index(0)); - - event_scatter_out = - (cudaEvent_t *)malloc(active_streams.count() * sizeof(cudaEvent_t)); - for (int i = 0; i < active_streams.count(); i++) { - event_scatter_out[i] = cuda_create_event(active_streams.gpu_index(i)); - } - } - // Allocate LUT - // LUT is used as a trivial encryption and must be initialized outside - // this constructor - for (uint i = 0; i < active_streams.count(); i++) { - auto lut = (Torus *)cuda_malloc_with_size_tracking_async( - num_luts * lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - lut_indexes_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - // lut_indexes is initialized to 0 by default - // if a different behavior is wanted, it should be rewritten later - cuda_memset_with_size_tracking_async( - lut_indexes, 0, lut_indexes_size, streams.stream(i), - streams.gpu_index(i), allocate_gpu_memory); - lut_vec.push_back(lut); - lut_indexes_vec.push_back(lut_indexes); + // event_scatter_in = cuda_create_event(active_streams.gpu_index(0)); + multi_gpu_gather_barrier.create_on(active_streams); + multi_gpu_broadcast_barrier.create_on(active_streams); + multi_gpu_scatter_barrier.create_on(active_streams); } + } - // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) - // by default - lwe_indexes_in = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_indexes_out = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(Torus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); + int_radix_lut_generic(CudaStreams streams, int_radix_params params, + uint32_t num_luts, uint32_t num_radix_blocks, + bool allocate_gpu_memory, uint64_t &size_tracker) { - h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); - h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + setup_config_and_degrees(streams, params.big_lwe_dimension, params, + num_luts, 1, num_radix_blocks, 1, + allocate_gpu_memory); - for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes_in[i] = i; + setup_degrees(); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - 
memcpy(h_lwe_indexes_out, h_lwe_indexes_in, - num_radix_blocks * sizeof(Torus)); + allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); - /// With multiple GPUs we allocate arrays to be pushed to the vectors and - /// copy data on each GPU then when we gather data to GPU 0 we can copy - /// back to the original indexing - multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec, - num_radix_blocks, params.big_lwe_dimension + 1, - size_tracker, allocate_gpu_memory); - multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec, - num_radix_blocks, params.small_lwe_dimension + 1, - size_tracker, allocate_gpu_memory); - multi_gpu_alloc_lwe_many_lut_output_async( - active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut, - params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory); - multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec, - num_radix_blocks, size_tracker, - allocate_gpu_memory); - multi_gpu_copy_array_from_cpu_async(active_streams, lwe_trivial_indexes_vec, - h_lwe_indexes_in, num_radix_blocks, - allocate_gpu_memory); - // Keyswitch - tmp_lwe_before_ks = new CudaRadixCiphertextFFI; - create_zero_radix_ciphertext_async( - streams.stream(0), streams.gpu_index(0), tmp_lwe_before_ks, - num_radix_blocks, params.big_lwe_dimension, size_tracker, - allocate_gpu_memory); - h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus))); - degrees = (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t)); + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + } + + int_radix_lut_generic(CudaStreams streams, uint32_t input_big_lwe_dimension, + int_radix_params params, uint32_t num_luts, + uint32_t num_radix_blocks, uint32_t num_input_blocks, + bool allocate_gpu_memory, uint64_t &size_tracker) { + + setup_config_and_degrees(streams, input_big_lwe_dimension, params, num_luts, + 1, num_radix_blocks, num_input_blocks, + allocate_gpu_memory); + + setup_degrees(); + + allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + } + + // constructor to reuse memory + int_radix_lut_generic(CudaStreams streams, int_radix_params params, + uint32_t num_luts, uint32_t num_radix_blocks, + int_radix_lut_generic *base_lut_object, + bool allocate_gpu_memory, uint64_t &size_tracker) { + setup_config_and_degrees(streams, params.big_lwe_dimension, params, + num_luts, 1, num_radix_blocks, 1, + allocate_gpu_memory); + + setup_degrees(); + + setup_mem_reuse(num_radix_blocks, base_lut_object); + + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + } + + // Construction for many luts + int_radix_lut_generic(CudaStreams streams, int_radix_params params, + uint32_t num_luts, uint32_t num_radix_blocks, + uint32_t num_many_lut, bool allocate_gpu_memory, + uint64_t &size_tracker) { + + setup_config_and_degrees(streams, params.big_lwe_dimension, params, + num_luts, num_many_lut, 
num_radix_blocks, 1, + allocate_gpu_memory); + + setup_degrees(); + + allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); + + allocate_luts_and_indexes(num_radix_blocks, size_tracker); + + setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory, + size_tracker); + + setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory, + size_tracker); } // Return a pointer to idx-ith lut at gpu_index's global memory - Torus *get_lut(uint32_t gpu_index, size_t idx) { + OutputTorus *get_lut(uint32_t gpu_index, size_t idx) { if (!gpu_memory_allocated) return nullptr; auto lut = lut_vec[gpu_index]; @@ -677,10 +606,16 @@ template struct int_radix_lut { } // Return a pointer to idx-ith degree - uint64_t *get_degree(size_t idx) { return °rees[num_many_lut * idx]; } + uint64_t *get_degree(size_t idx) { + GPU_ASSERT(idx < num_luts, "Invalid degree requested"); + return °rees[num_many_lut * idx]; + } // Return a pointer to idx-ith max degree - uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; } + uint64_t *get_max_degree(size_t idx) { + GPU_ASSERT(idx < num_luts, "Invalid degree requested"); + return &max_degrees[idx]; + } // Return a pointer to idx-ith lut indexes at gpu_index's global memory Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) { @@ -690,6 +625,31 @@ template struct int_radix_lut { return &lut_indexes[ind]; } + // Allocate LUT + // LUT is used as a trivial encryption and must be initialized outside + // this constructor + void allocate_luts_and_indexes(uint32_t num_radix_blocks, + uint64_t &size_tracker) { + uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus); + uint64_t lut_buffer_size = (params.glwe_dimension + 1) * + params.polynomial_size * sizeof(OutputTorus); + + for (uint i = 0; i < active_streams.count(); i++) { + auto lut = (OutputTorus *)cuda_malloc_with_size_tracking_async( + num_luts * lut_buffer_size, active_streams.stream(i), + active_streams.gpu_index(i), size_tracker, gpu_memory_allocated); + auto lut_indexes = (Torus *)cuda_malloc_with_size_tracking_async( + lut_indexes_size, active_streams.stream(i), + active_streams.gpu_index(i), size_tracker, gpu_memory_allocated); + // lut_indexes is initialized to 0 by default + // if a different behavior is wanted, it should be rewritten later + cuda_memset_with_size_tracking_async( + lut_indexes, 0, lut_indexes_size, active_streams.stream(i), + active_streams.gpu_index(i), gpu_memory_allocated); + lut_vec.push_back(lut); + lut_indexes_vec.push_back(lut_indexes); + } + } // If this function is called we assume the lwe_indexes_(in/out) are not the // trivial anymore and thus we disable optimizations void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index, @@ -699,8 +659,8 @@ template struct int_radix_lut { memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus)); cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_blocks * sizeof(Torus), stream, - gpu_index, gpu_memory_allocated); + lwe_indexes_in.get(), h_lwe_indexes_in, num_blocks * sizeof(Torus), + stream, gpu_index, gpu_memory_allocated); cuda_memcpy_with_size_tracking_async_to_gpu( lwe_indexes_out, h_lwe_indexes_out, num_blocks * sizeof(Torus), stream, gpu_index, gpu_memory_allocated); @@ -711,47 +671,66 @@ template struct int_radix_lut { // Broadcast luts from device gpu_indexes[0] to all active gpus void broadcast_lut(CudaStreams new_active_streams, bool broadcast_lut_values = true) { + PANIC_IF_FALSE(new_active_streams.gpu_index(0) == + 
active_streams.gpu_index(0), + "Broadcasting LUTs can only be done using the same GPUs " + " originally assigned to the int_radix_lut"); + // We only do broadcast if there are more than 1 active GPU - if (new_active_streams.count() > 1) { - int active_device = cuda_get_device(); - - uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size; - - auto src_lut = lut_vec[0]; - auto src_lut_indexes = lut_indexes_vec[0]; - if (active_streams.count() > 1) - cuda_event_record(event_broadcast, new_active_streams.stream(0), - new_active_streams.gpu_index(0)); - for (uint i = 0; i < new_active_streams.count(); i++) { - PANIC_IF_FALSE( - new_active_streams.gpu_index(i) == active_streams.gpu_index(i), - "Broadcasting LUTs can only be done to the LUT streams or to new " - "streams that reside on the same GPUs as the source LUTs"); + if (new_active_streams.count() == 1) + return; + + GPU_ASSERT(active_streams.count() >= new_active_streams.count(), + "To broadcast a LUT to a GPU set, it must have been initialized " + "with a GPU set that is greater or equal in size"); + + int active_device = cuda_get_device(); + + uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size; + + // Wait for GPU 0 to receive all data from previous computations + // that may have occurred on different GPUs + multi_gpu_broadcast_barrier.local_streams_wait_for_stream_0( + new_active_streams); + // The LUT and its indexes reside on GPU 0 + // these were filled by calls to generate_device_accumulator + // due to the previous synchronization, we're sure these buffers have + // finished copying to GPU 0 from CPU + auto src_lut = lut_vec[0]; + auto src_lut_indexes = lut_indexes_vec[0]; + + for (uint i = 1; i < new_active_streams.count(); i++) { + PANIC_IF_FALSE( + new_active_streams.gpu_index(i) == active_streams.gpu_index(i), + "Broadcasting LUTs can only be done to the LUT streams or to new " + "streams that reside on the same GPUs as the source LUTs"); + + // Check for redundant copies #ifndef DEBUG_FAKE_MULTI_GPU - if (new_active_streams.gpu_index(i) == new_active_streams.gpu_index(0)) - continue; + PANIC_IF_FALSE(new_active_streams.gpu_index(i) != + new_active_streams.gpu_index(0), + "Broadcast LUT does not handle duplicate GPUs in the " + "active streams set"); #endif - cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast, - new_active_streams.gpu_index(i)); - if (broadcast_lut_values) { - auto dst_lut = lut_vec[i]; - cuda_memcpy_with_size_tracking_async_gpu_to_gpu( - dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), - new_active_streams.stream(i), new_active_streams.gpu_index(i), - gpu_memory_allocated); - } - auto dst_lut_indexes = lut_indexes_vec[i]; + if (broadcast_lut_values) { + auto dst_lut = lut_vec[i]; cuda_memcpy_with_size_tracking_async_gpu_to_gpu( - dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus), + dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), new_active_streams.stream(i), new_active_streams.gpu_index(i), gpu_memory_allocated); } - // Ensure the device set at the end of this method is the same as it was - // set at the beginning - cuda_set_device(active_device); + auto dst_lut_indexes = lut_indexes_vec[i]; + cuda_memcpy_with_size_tracking_async_gpu_to_gpu( + dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus), + new_active_streams.stream(i), new_active_streams.gpu_index(i), + gpu_memory_allocated); } + + // Ensure the device set at the end of this method is the same as it was + // set at the beginning + cuda_set_device(active_device); 
} void allocate_lwe_vector_for_non_trivial_indexes( @@ -779,75 +758,80 @@ template struct int_radix_lut { PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(), "Lut vec and Lut vec indexes must have the same size"); for (uint i = 0; i < lut_vec.size(); i++) { - cuda_drop_with_size_tracking_async(lut_vec[i], streams.stream(i), - streams.gpu_index(i), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams.stream(i), - streams.gpu_index(i), + cuda_drop_with_size_tracking_async(lut_vec[i], active_streams.stream(i), + active_streams.gpu_index(i), gpu_memory_allocated); - } - cuda_drop_with_size_tracking_async(lwe_indexes_in, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lwe_indexes_out, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); + cuda_drop_with_size_tracking_async( + lut_indexes_vec[i], active_streams.stream(i), + active_streams.gpu_index(i), gpu_memory_allocated); + } + lwe_indexes_in.reset(); + /*cuda_drop_with_size_tracking_async(lwe_indexes_in, + active_streams.stream(0), active_streams.gpu_index(0), + gpu_memory_allocated);*/ + cuda_drop_with_size_tracking_async( + lwe_indexes_out, active_streams.stream(0), active_streams.gpu_index(0), + gpu_memory_allocated); + cuda_drop_with_size_tracking_async( + lwe_trivial_indexes, active_streams.stream(0), + active_streams.gpu_index(0), gpu_memory_allocated); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + cuda_synchronize_stream(active_streams.stream(0), + active_streams.gpu_index(0)); lut_vec.clear(); lut_indexes_vec.clear(); free(h_lwe_indexes_in); free(h_lwe_indexes_out); + if (active_streams.count() > 1) { + active_streams.synchronize(); + multi_gpu_gather_barrier.release(); + multi_gpu_broadcast_barrier.release(); + multi_gpu_scatter_barrier.release(); + } + if (!mem_reuse) { - release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - tmp_lwe_before_ks, gpu_memory_allocated); + GPU_ASSERT(tmp_lwe_before_ks.use_count() == 1, + "This int_radix_lut is still sharing memory with another"); + release_radix_ciphertext_async( + active_streams.stream(0), active_streams.gpu_index(0), + tmp_lwe_before_ks.get(), gpu_memory_allocated); for (int i = 0; i < buffer.size(); i++) { switch (params.pbs_type) { case MULTI_BIT: cleanup_cuda_multi_bit_programmable_bootstrap( - streams.stream(i), streams.gpu_index(i), &buffer[i]); + active_streams.stream(i), active_streams.gpu_index(i), + &buffer[i]); break; case CLASSICAL: - cleanup_cuda_programmable_bootstrap(streams.stream(i), - streams.gpu_index(i), &buffer[i]); + cleanup_cuda_programmable_bootstrap(active_streams.stream(i), + active_streams.gpu_index(i), + &buffer[i]); break; default: PANIC("Cuda error (PBS): unknown PBS type. 
") } - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); + cuda_synchronize_stream(active_streams.stream(i), + active_streams.gpu_index(i)); } - delete tmp_lwe_before_ks; + tmp_lwe_before_ks.reset(); buffer.clear(); if (gpu_memory_allocated) { - multi_gpu_release_async(streams, lwe_array_in_vec); - multi_gpu_release_async(streams, lwe_after_ks_vec); - multi_gpu_release_async(streams, lwe_after_pbs_vec); - multi_gpu_release_async(streams, lwe_trivial_indexes_vec); - streams.synchronize(); + multi_gpu_release_async(active_streams, lwe_array_in_vec); + multi_gpu_release_async(active_streams, lwe_after_ks_vec); + multi_gpu_release_async(active_streams, lwe_after_pbs_vec); + multi_gpu_release_async(active_streams, lwe_trivial_indexes_vec); } lwe_array_in_vec.clear(); lwe_after_ks_vec.clear(); lwe_after_pbs_vec.clear(); lwe_trivial_indexes_vec.clear(); - if (active_streams.count() > 1) { - active_streams.synchronize(); - for (uint i = 0; i < active_streams.count(); i++) { - cuda_event_destroy(event_scatter_out[i], active_streams.gpu_index(i)); - } - cuda_event_destroy(event_scatter_in, active_streams.gpu_index(0)); - cuda_event_destroy(event_broadcast, active_streams.gpu_index(0)); - free(event_scatter_out); - } if (lwe_aligned_vec.size() > 0) { for (uint i = 0; i < active_streams.count(); i++) { cuda_drop_with_size_tracking_async( - lwe_aligned_vec[i], streams.stream(0), streams.gpu_index(0), - gpu_memory_allocated); + lwe_aligned_vec[i], active_streams.stream(0), + active_streams.gpu_index(0), gpu_memory_allocated); } lwe_aligned_vec.clear(); } @@ -858,46 +842,13 @@ template struct int_radix_lut { } }; -template struct int_noise_squashing_lut { - - int_radix_params params; - uint32_t input_glwe_dimension; - uint32_t input_polynomial_size; - uint32_t input_big_lwe_dimension; - uint32_t num_blocks; - // Tracks the degree of each LUT and the max degree on CPU - // The max degree is (message_modulus * carry_modulus - 1) except for many lut - // for which it's different - uint64_t *degrees; - uint64_t *max_degrees; - - CudaStreams active_streams; - - // There will be one buffer on each GPU in multi-GPU computations - // (same for tmp lwe arrays) - std::vector pbs_buffer; - - std::vector<__uint128_t *> lut_vec; +template +using int_radix_lut = int_radix_lut_generic; - CudaRadixCiphertextFFI *tmp_lwe_before_ks; +template +struct int_noise_squashing_lut + : int_radix_lut_generic { - // All tmp lwe arrays and index arrays for lwe contain the total - // amount of blocks to be computed on, there is no split between GPUs - // for the moment - InputTorus *lwe_indexes_in; - - InputTorus *h_lwe_indexes_in; - InputTorus *lwe_trivial_indexes; - - /// For multi GPU execution we create vectors of pointers for inputs and - /// outputs - std::vector lwe_array_in_vec; - std::vector lwe_after_ks_vec; - std::vector<__uint128_t *> lwe_after_pbs_vec; - std::vector lwe_trivial_indexes_vec; - - bool using_trivial_lwe_indexes = true; - bool gpu_memory_allocated; std::vector lwe_aligned_scatter_vec; std::vector<__uint128_t *> lwe_aligned_gather_vec; // noise squashing constructor @@ -906,165 +857,26 @@ template struct int_noise_squashing_lut { uint32_t input_polynomial_size, uint32_t num_radix_blocks, uint32_t original_num_blocks, - bool allocate_gpu_memory, uint64_t &size_tracker) { - this->params = params; - this->num_blocks = num_radix_blocks; - gpu_memory_allocated = allocate_gpu_memory; - // This are the glwe dimension and polynomial size before squashing - this->input_glwe_dimension = 
input_glwe_dimension; - this->input_polynomial_size = input_polynomial_size; - uint32_t input_big_lwe_dimension = - input_glwe_dimension * input_polynomial_size; - this->input_big_lwe_dimension = input_big_lwe_dimension; - - uint64_t lut_buffer_size = (params.glwe_dimension + 1) * - params.polynomial_size * sizeof(__uint128_t); + bool allocate_gpu_memory, uint64_t &size_tracker) - /////////////// - active_streams = streams.active_gpu_subset(num_radix_blocks); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - for (uint i = 0; i < active_streams.count(); i++) { - cuda_set_device(streams.gpu_index(i)); - auto num_radix_blocks_on_gpu = std::max( - THRESHOLD_MULTI_GPU, - get_num_inputs_on_gpu(num_radix_blocks, i, active_streams.count())); - int8_t *gpu_pbs_buffer; - uint64_t size = 0; - execute_scratch_pbs<__uint128_t>( - streams.stream(i), streams.gpu_index(i), &gpu_pbs_buffer, - params.glwe_dimension, params.small_lwe_dimension, - params.polynomial_size, params.pbs_level, params.grouping_factor, - num_radix_blocks_on_gpu, params.pbs_type, allocate_gpu_memory, - params.noise_reduction_type, size); - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); - if (i == 0) { - size_tracker += size; - } - pbs_buffer.push_back(gpu_pbs_buffer); - } - lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(InputTorus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async( - num_radix_blocks * sizeof(InputTorus), streams.stream(0), - streams.gpu_index(0), size_tracker, allocate_gpu_memory); - h_lwe_indexes_in = - (InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus)); - for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes_in[i] = i; - - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus), - streams.stream(0), streams.gpu_index(0), allocate_gpu_memory); - cuda_memcpy_with_size_tracking_async_to_gpu( - lwe_trivial_indexes, h_lwe_indexes_in, - num_radix_blocks * sizeof(InputTorus), streams.stream(0), - streams.gpu_index(0), allocate_gpu_memory); - - multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec, - num_radix_blocks, params.big_lwe_dimension + 1, - size_tracker, allocate_gpu_memory); - - multi_gpu_alloc_lwe_async( - active_streams, lwe_after_ks_vec, num_radix_blocks, - params.small_lwe_dimension + 1, size_tracker, allocate_gpu_memory); - multi_gpu_alloc_lwe_async<__uint128_t>( - active_streams, lwe_after_pbs_vec, num_radix_blocks, - params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory); - multi_gpu_alloc_array_async( - active_streams, lwe_trivial_indexes_vec, num_radix_blocks, size_tracker, - allocate_gpu_memory); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec, - lwe_trivial_indexes, num_radix_blocks, - allocate_gpu_memory); - if (allocate_gpu_memory) { - // Allocate LUT - // LUT is used as a trivial encryption and must be initialized outside - // this constructor - for (uint i = 0; i < active_streams.count(); i++) { - auto lut = (__uint128_t *)cuda_malloc_with_size_tracking_async( - lut_buffer_size, streams.stream(i), streams.gpu_index(i), - size_tracker, allocate_gpu_memory); - lut_vec.push_back(lut); - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); - } - } - // Keyswitch - tmp_lwe_before_ks = new CudaRadixCiphertextFFI; - 
create_zero_radix_ciphertext_async( - streams.stream(0), streams.gpu_index(0), tmp_lwe_before_ks, - original_num_blocks, input_big_lwe_dimension, size_tracker, - allocate_gpu_memory); - - degrees = (uint64_t *)malloc(sizeof(uint64_t)); - max_degrees = (uint64_t *)malloc(sizeof(uint64_t)); + : int_radix_lut_generic( + streams, input_glwe_dimension * input_polynomial_size, params, 1, + num_radix_blocks, original_num_blocks, allocate_gpu_memory, + size_tracker) { // lut for the squashing auto f_squash = [](__uint128_t block) -> __uint128_t { return block; }; - // Generate the identity LUT, for now we only use one GPU - for (uint i = 0; i < active_streams.count(); i++) { - auto squash_lut = lut_vec[i]; - generate_device_accumulator<__uint128_t>( - streams.stream(i), streams.gpu_index(i), squash_lut, degrees, - max_degrees, params.glwe_dimension, params.polynomial_size, - params.message_modulus, params.carry_modulus, f_squash, - allocate_gpu_memory); - } - } - void release(CudaStreams streams) { - for (uint i = 0; i < lut_vec.size(); i++) { - cuda_drop_with_size_tracking_async(lut_vec[i], streams.stream(i), - streams.gpu_index(i), - gpu_memory_allocated); - } - cuda_drop_with_size_tracking_async(lwe_indexes_in, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0), - streams.gpu_index(0), - gpu_memory_allocated); - cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - lut_vec.clear(); - free(h_lwe_indexes_in); + generate_device_accumulator<__uint128_t>( + this->active_streams.stream(0), this->active_streams.gpu_index(0), + this->get_lut(0, 0), this->get_degree(0), this->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, f_squash, allocate_gpu_memory); - release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - tmp_lwe_before_ks, gpu_memory_allocated); - for (int i = 0; i < pbs_buffer.size(); i++) { - switch (params.pbs_type) { - case MULTI_BIT: - cleanup_cuda_multi_bit_programmable_bootstrap_128( - streams.stream(i), streams.gpu_index(i), &pbs_buffer[i]); - break; - case CLASSICAL: - cleanup_cuda_programmable_bootstrap_128( - streams.stream(i), streams.gpu_index(i), &pbs_buffer[i]); - break; - default: - PANIC("Cuda error (PBS): unknown PBS type. 
") - } - cuda_synchronize_stream(streams.stream(i), streams.gpu_index(i)); - } - if (lwe_aligned_gather_vec.size() > 0) { - multi_gpu_release_async(streams, lwe_aligned_gather_vec); - multi_gpu_release_async(streams, lwe_aligned_scatter_vec); - } - multi_gpu_release_async(streams, lwe_array_in_vec); - multi_gpu_release_async(streams, lwe_after_ks_vec); - multi_gpu_release_async(streams, lwe_after_pbs_vec); - multi_gpu_release_async(streams, lwe_trivial_indexes_vec); - streams.synchronize(); - - lwe_array_in_vec.clear(); - lwe_after_ks_vec.clear(); - lwe_after_pbs_vec.clear(); - lwe_trivial_indexes_vec.clear(); - - delete tmp_lwe_before_ks; - pbs_buffer.clear(); + this->broadcast_lut(this->active_streams); } + + using int_radix_lut_generic::release; }; // Forward declarations for operation buffers diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 7c9191d268..342276f762 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -266,6 +266,11 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu( uint32_t gpu_index, bool gpu_memory_allocated) { if (size == 0 || !gpu_memory_allocated) return; + GPU_ASSERT(dest != nullptr, + "Cuda error: trying to copy gpu->gpu to null ptr"); + GPU_ASSERT(src != nullptr, + "Cuda error: trying to copy gpu->gpu from null ptr"); + cudaPointerAttributes attr_dest; check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest)); PANIC_IF_FALSE( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 9c9c5d8c9b..f9ee496aaf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -31,8 +31,8 @@ __host__ void zero_out_if(CudaStreams streams, // second operand is not an array auto tmp_lwe_array_input = mem_ptr->tmp; host_pack_bivariate_blocks_with_single_block( - streams, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input, - lwe_condition, predicate->lwe_indexes_in, params.message_modulus, + streams, tmp_lwe_array_input, predicate->lwe_indexes_in.get(), lwe_array_input, + lwe_condition, predicate->lwe_indexes_in.get(), params.message_modulus, num_radix_blocks); integer_radix_apply_univariate_lookup_table_kb( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index bfc7c3ff9d..807e8e6e3f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -344,7 +344,7 @@ host_integer_decompress(CudaStreams streams, execute_pbs_async( active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe, - lut->lwe_indexes_in, d_bsks, lut->buffer, + lut->lwe_indexes_in.get(), d_bsks, lut->buffer, encryption_params.glwe_dimension, compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, @@ -359,17 +359,13 @@ host_integer_decompress(CudaStreams streams, std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - /// Make sure all data that should be on GPU 0 is indeed there - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - 
streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); + /// With multiple GPUs we push to the vectors on each GPU then when we /// gather data to GPU 0 we can copy back to the original indexing multi_gpu_scatter_lwe_async( - active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in, + active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_blocks_to_decompress, compression_params.small_lwe_dimension + 1); @@ -395,15 +391,8 @@ host_integer_decompress(CudaStreams streams, /// Synchronize all GPUs // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], active_streams.stream(j), - active_streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } } else { static_assert(std::is_same_v, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 4c2d1a13ba..841f31cdc8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -546,7 +546,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in, ksks, + (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in.get(), ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); @@ -560,19 +560,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); /// With multiple GPUs we push to the vectors on each GPU then when we /// gather data to GPU 0 we can copy back to the original indexing PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() @@ -598,16 +594,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lwe_indexes_out, lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); 
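+      // After this barrier, anything enqueued on streams.stream(0) is
+      // ordered after the gather of all per-GPU PBS results, so the output
+      // radix ciphertext can safely be consumed from GPU 0's stream.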
} for (uint i = 0; i < num_radix_blocks; i++) { auto degrees_index = lut->h_lut_indexes[i]; @@ -660,7 +648,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in, ksks, + (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in.get(), ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); @@ -674,18 +662,15 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); + /// With multiple GPUs we push to the vectors on each GPU then when we /// gather data to GPU 0 we can copy back to the original indexing PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() @@ -712,16 +697,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( num_radix_blocks, big_lwe_dimension + 1, num_many_lut); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) { auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks]; @@ -771,10 +748,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( uint32_t lut_stride = 0; // Left message is shifted - auto lwe_array_pbs_in = lut->tmp_lwe_before_ks; + auto lwe_array_pbs_in = lut->tmp_lwe_before_ks.get(); host_pack_bivariate_blocks( streams, lwe_array_pbs_in, lut->lwe_trivial_indexes, lwe_array_1, - lwe_array_2, lut->lwe_indexes_in, shift, num_radix_blocks, + lwe_array_2, lut->lwe_indexes_in.get(), shift, num_radix_blocks, params.message_modulus, params.carry_modulus); check_cuda_error(cudaGetLastError()); @@ -789,7 +766,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in, ksks, + (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in.get(), ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); @@ -802,16 +779,13 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for 
(int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); + PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() @@ -837,16 +811,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lut->lwe_indexes_out, lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } for (uint i = 0; i < num_radix_blocks; i++) { auto degrees_index = lut->h_lut_indexes[i]; @@ -2368,7 +2334,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( /// For multi GPU execution we create vectors of pointers for inputs and /// outputs - auto lwe_array_pbs_in = lut->tmp_lwe_before_ks; + auto lwe_array_pbs_in = lut->tmp_lwe_before_ks.get(); std::vector lwe_array_in_vec = lut->lwe_array_in_vec; std::vector lwe_after_ks_vec = lut->lwe_after_ks_vec; std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec; @@ -2387,7 +2353,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( if (active_streams.count() == 1) { execute_keyswitch_async( streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], - (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in, ksks, + (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in.get(), ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, lwe_array_out->num_radix_blocks); @@ -2399,7 +2365,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( execute_pbs_async( streams.get_ith(0), (__uint128_t *)lwe_array_out->ptr, lwe_trivial_indexes_vec[0], lut->lut_vec, lwe_trivial_indexes_vec, - lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->pbs_buffer, + lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0, 0); @@ -2411,7 +2377,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( /// gather data to GPU 0 we can copy back to the original indexing multi_gpu_scatter_lwe_async( active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr, - lut->lwe_indexes_in, lut->using_trivial_lwe_indexes, + lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_scatter_vec, lut->active_streams.count(), lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1); @@ -2426,7 +2392,7 @@ __host__ void integer_radix_apply_noise_squashing_kb( execute_pbs_async( active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec, lut->lut_vec, lwe_trivial_indexes_vec, lwe_after_ks_vec, - lwe_trivial_indexes_vec, bsks, lut->pbs_buffer, glwe_dimension, + lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, 
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0, 0); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 92d53c7f19..d48f171181 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -375,7 +375,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( while (needs_processing) { auto luts_message_carry = mem_ptr->luts_message_carry; - auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in; + auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in.get(); auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out; calculate_chunks <<>>( @@ -433,7 +433,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( if (mem_ptr->reduce_degrees_for_single_carry_propagation) { auto luts_message_carry = mem_ptr->luts_message_carry; - auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in; + auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in.get(); auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out; prepare_final_pbs_indexes <<<1, 2 * num_radix_blocks, 0, streams.stream(0)>>>( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh index 360a403cfa..5147717fcb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh @@ -34,7 +34,7 @@ void host_integer_grouped_oprf(CudaStreams streams, execute_pbs_async( streams.get_ith(0), (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, - const_cast(seeded_lwe_input), lut->lwe_indexes_in, bsks, + const_cast(seeded_lwe_input), lut->lwe_indexes_in.get(), bsks, lut->buffer, mem_ptr->params.glwe_dimension, mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, @@ -45,16 +45,12 @@ void host_integer_grouped_oprf(CudaStreams streams, std::vector lwe_after_pbs_vec = lut->lwe_after_pbs_vec; std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - cuda_event_record(lut->event_scatter_in, streams.stream(0), - streams.gpu_index(0)); - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in, - streams.gpu_index(j)); - } + lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0( + active_streams); PUSH_RANGE("scatter") multi_gpu_scatter_lwe_async( - active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in, + active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, active_streams.count(), num_blocks_to_process, mem_ptr->params.small_lwe_dimension + 1); @@ -76,16 +72,8 @@ void host_integer_grouped_oprf(CudaStreams streams, lut->lwe_aligned_vec, num_blocks_to_process, mem_ptr->params.big_lwe_dimension + 1); POP_RANGE() - // other gpus record their events - for (int j = 1; j < active_streams.count(); j++) { - cuda_event_record(lut->event_scatter_out[j], streams.stream(j), - streams.gpu_index(j)); - } - // GPU 0 waits for all - for (int j = 1; j < active_streams.count(); j++) { - cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j], - streams.gpu_index(0)); - } + 
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams( + active_streams); } for (uint32_t i = 0; i < num_blocks_to_process; i++) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index 8c183aca1f..3479f45b28 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -150,7 +150,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // control_bit|b|a host_pack_bivariate_blocks( streams, mux_inputs, mux_lut->lwe_indexes_out, rotated_input, - input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits, + input_bits_a, mux_lut->lwe_indexes_in.get(), 2, total_nb_bits, mem->params.message_modulus, mem->params.carry_modulus); // The shift bit is already properly aligned/positioned
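
For reference, every refactored call site above now shares one fork/join shape: stream 0 prepares the input, worker streams consume it, and stream 0 waits for the workers before gathering results back. The sketch below is a minimal illustration, assuming the CudaStreams and CudaStreamsBarrier API from helper_multi_gpu.h; launch_worker_slice is a hypothetical stand-in for the per-GPU keyswitch/PBS work, and both barriers are assumed to have been initialized with create_on() when the LUT was allocated.

#include <cstdint>
#include "helper_multi_gpu.h"

// Hypothetical per-GPU workload, enqueued on the given stream.
void launch_worker_slice(cudaStream_t stream, uint32_t gpu_index);

void apply_multi_gpu(CudaStreams &active_streams,
                     CudaStreamsBarrier &scatter_barrier,
                     CudaStreamsBarrier &gather_barrier) {
  // Worker streams block until stream 0 has finished producing the scattered
  // input; this replaces the per-call-site cuda_event_record /
  // cuda_stream_wait_event loops removed in this diff.
  scatter_barrier.local_streams_wait_for_stream_0(active_streams);

  // Each GPU processes its slice on its own stream.
  for (int j = 0; j < active_streams.count(); j++)
    launch_worker_slice(active_streams.stream(j), active_streams.gpu_index(j));

  // Stream 0 blocks until every worker has published its output, so the
  // gather back to GPU 0 only observes completed results.
  gather_barrier.stream_0_wait_for_local_streams(active_streams);
}

Centralizing the synchronization this way removes five duplicated record/wait loops across the decompression, univariate, many-LUT, bivariate, and OPRF paths.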
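Likewise, the recurring lwe_indexes_in.get() and tmp_lwe_before_ks.get() changes all follow from one ownership move: those members are now std::shared_ptr owners whose deleter frees the device buffer, while kernels and the keyswitch/PBS entry points keep raw-pointer signatures and merely borrow. A minimal sketch of the pattern, with device_alloc_async and device_free_async as hypothetical stand-ins for the backend's tracked, stream-ordered allocator:

#include <cstdint>
#include <memory>

// Hypothetical stand-ins for the backend's tracked device allocation calls.
void *device_alloc_async(uint64_t size);
void device_free_async(void *ptr);

// Wrap a device buffer in a shared_ptr whose deleter performs the free, so
// the buffer is released exactly once however many structures share it.
template <typename T> std::shared_ptr<T> make_device_shared(uint64_t size) {
  return std::shared_ptr<T>(static_cast<T *>(device_alloc_async(size)),
                            [](T *ptr) { device_free_async(ptr); });
}

// Call sites keep the shared_ptr alive and lend out the raw pointer, e.g.:
//   auto lwe_indexes_in = make_device_shared<uint64_t>(bytes);
//   execute_keyswitch_async<uint64_t>(..., lwe_indexes_in.get(), ...);

Each .get() at a call site is then purely a borrow: no call site frees the pointer, and the last owner going out of scope triggers the deleter.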