Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <memory>

extern "C" {

Expand Down Expand Up @@ -140,4 +141,34 @@ template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);

// Custom deleter for std::shared_ptr-managed device allocations.
//
// Captures the stream/GPU the allocation was made on so the matching
// cuda_drop_with_size_tracking_async call releases the memory in the same
// stream order. The size tracker reference and allocation flag mirror the
// arguments used at allocation time.
template <class T> struct malloc_with_size_tracking_async_deleter {
private:
  cudaStream_t _stream;          // stream the allocation/free is ordered on
  uint32_t _gpu_index;           // GPU the pointer lives on
  uint64_t &_size_tracker;       // tracker passed at allocation time
  bool _allocate_gpu_memory;     // whether device memory was actually allocated

public:
  malloc_with_size_tracking_async_deleter(cudaStream_t stream,
                                          uint32_t gpu_index,
                                          uint64_t &size_tracker,
                                          bool allocate_gpu_memory)
      : _stream(stream), _gpu_index(gpu_index), _size_tracker(size_tracker),
        _allocate_gpu_memory(allocate_gpu_memory) {}

  // Called by shared_ptr when the last reference is dropped.
  void operator()(T *ptr) {
    cuda_drop_with_size_tracking_async(ptr, _stream, _gpu_index,
                                       _allocate_gpu_memory);
  }
};

template <class T>
std::shared_ptr<T> cuda_make_shared_with_size_tracking_async(
uint64_t size, cudaStream_t stream, uint32_t gpu_index,
uint64_t &size_tracker, bool allocate_gpu_memory) {
return std::shared_ptr<T>(
(T*)cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
size_tracker, allocate_gpu_memory),
malloc_with_size_tracking_async_deleter<T>(
stream, gpu_index, size_tracker, allocate_gpu_memory));
}

#endif
89 changes: 89 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,4 +183,93 @@ struct CudaStreams {
}
};

// Multi-GPU fork/join barrier built on CUDA events.
//
// After create_on(), one event per stream is available so that:
//  - local_streams_wait_for_stream_0(): worker streams (index >= 1) wait for
//    the work currently enqueued on stream 0 (fork point);
//  - stream_0_wait_for_local_streams(): stream 0 waits for the work currently
//    enqueued on all worker streams (join point).
// release() must be called before destruction because event teardown needs
// the GPU indices; the destructor asserts this.
struct CudaStreamsBarrier {
private:
  std::vector<cudaEvent_t> _events;
  CudaStreams _streams;

  // The barrier owns per-GPU CUDA events; copying would lead to
  // double-destruction of those events, so copy operations are deleted
  // (the old empty-bodied private copy ctor would have produced a
  // half-initialized barrier if ever invoked by a friend).
  CudaStreamsBarrier(const CudaStreamsBarrier &) = delete;
  CudaStreamsBarrier &operator=(const CudaStreamsBarrier &) = delete;

public:
  CudaStreamsBarrier() {}

  // Creates one event per stream in `streams`. Must be called exactly once
  // before any synchronization call; requires more than one GPU.
  void create_on(const CudaStreams &streams) {
    GPU_ASSERT(_events.empty(),
               "CudaStreamsBarrier: create_on called on an already "
               "initialized barrier");
    GPU_ASSERT(streams.count() > 1, "CudaStreamsBarrier: "
                                    "Attempted to create on single GPU");

    _streams = streams;
    _events.resize(streams.count());
    for (int i = 0; i < streams.count(); i++) {
      _events[i] = cuda_create_event(streams.gpu_index(i));
    }
  }

  // Fork: every worker stream (index >= 1) of `user_streams` waits until the
  // work currently enqueued on stream 0 has completed.
  void local_streams_wait_for_stream_0(const CudaStreams &user_streams) {
    GPU_ASSERT(!_events.empty(),
               "CudaStreamsBarrier: must call create_on before use");
    // Guard before indexing _streams.gpu_index(j) below.
    GPU_ASSERT(user_streams.count() <= _events.size(),
               "CudaStreamsBarrier: trying to synchronize too many streams. "
               "The barrier was created on %lu streams, while "
               "the user stream set has %u streams",
               _events.size(), user_streams.count());
    GPU_ASSERT(user_streams.gpu_index(0) == _streams.gpu_index(0),
               "CudaStreamsBarrier: synchronization can only be performed on "
               "the GPUs the barrier was initially created on.");

    cuda_event_record(_events[0], user_streams.stream(0),
                      user_streams.gpu_index(0));
    for (int j = 1; j < user_streams.count(); j++) {
      GPU_ASSERT(user_streams.gpu_index(j) == _streams.gpu_index(j),
                 "CudaStreamsBarrier: synchronization can only be performed on "
                 "the GPUs the barrier was initially created on.");
      cuda_stream_wait_event(user_streams.stream(j), _events[0],
                             user_streams.gpu_index(j));
    }
  }

  // Join: stream 0 waits until the work currently enqueued on every worker
  // stream (index >= 1) has completed. No-op when there is a single stream.
  void stream_0_wait_for_local_streams(const CudaStreams &user_streams) {
    GPU_ASSERT(!_events.empty(),
               "CudaStreamsBarrier: must call create_on before use");
    GPU_ASSERT(
        user_streams.count() <= _events.size(),
        "CudaStreamsBarrier: trying to synchronize too many "
        "streams. "
        "The barrier was created on a LUT that had %lu active streams, while "
        "the user stream set has %u streams",
        _events.size(), user_streams.count());

    if (user_streams.count() > 1) {
      // Worker GPUs record their events
      for (int j = 1; j < user_streams.count(); j++) {
        GPU_ASSERT(_streams.gpu_index(j) == user_streams.gpu_index(j),
                   "CudaStreamsBarrier: The user stream "
                   "set GPU[%d]=%u while the LUT stream set GPU[%d]=%u",
                   j, user_streams.gpu_index(j), j, _streams.gpu_index(j));

        cuda_event_record(_events[j], user_streams.stream(j),
                          user_streams.gpu_index(j));
      }

      // GPU 0 waits for all workers
      for (int j = 1; j < user_streams.count(); j++) {
        cuda_stream_wait_event(user_streams.stream(0), _events[j],
                               user_streams.gpu_index(0));
      }
    }
  }

  // Destroys all events. Must be called before the destructor runs.
  void release() {
    for (int j = 0; j < _streams.count(); j++) {
      cuda_event_destroy(_events[j], _streams.gpu_index(j));
    }

    _events.clear();
  }

  ~CudaStreamsBarrier() {
    GPU_ASSERT(_events.empty(),
               "CudaStreamsBarrier: must "
               "call release before destruction: events size = %lu",
               _events.size());
  }
};

#endif
Loading
Loading