sst-integration-stream: make SST integration work with streams (#103)

William-An · web-flow · commit 6ab2ca48f7f3 · 2025-03-06T19:06:03.000Z
* sst-integration-stream: add APIs to make SST integration work with streams

* Add dev container specs
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,4 @@
+{
+    "name": "CUDA 12.8",
+    "image": "ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8"
+}
diff --git a/.devcontainer/sst_integration/devcontainer.json b/.devcontainer/sst_integration/devcontainer.json
@@ -0,0 +1,4 @@
+{
+    "name": "SST CUDA 11.7",
+    "image": "ghcr.io/accel-sim/accel-sim-framework:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly"
+}
diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc
@@ -1809,6 +1809,8 @@ cudaDeviceGetAttributeInternal(int *value, enum cudaDeviceAttr attr, int device,
       case 19:
         *value = 0;
         break;
+      case 20:  // cudaDevAttrComputeMode for controlling cudaSetDevice for threads
+        *value = 0; // Dummy value, should not affect simulation
+        // Stop here, as case 19 does; without the break, control falls
+        // through into the case 21/22/23 arm that follows.
+        break;
       case 21:
       case 22:
       case 23:
@@ -2429,6 +2431,18 @@ void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
       ->SST_gpgpusim_numcores_equal_check(sst_numcores);
 }
 
+/**
+ * @brief Lets SST query whether kernel launches are blocking.
+ *
+ * Forwards the result of stream_manager::is_blocking() for the stream
+ * manager owned by the current GPGPU-Sim context.
+ *
+ * Future: a better interface to the GPGPU-Sim config will be needed for
+ * integration with outside simulators.
+ *
+ * @return whatever the stream manager's is_blocking() reports.
+ */
+bool SST_gpgpusim_launch_blocking() {
+  auto *streams = GPGPU_Context()->the_gpgpusim->g_stream_manager;
+  return streams->is_blocking();
+}
+
 uint64_t cudaMallocSST(void **devPtr, size_t size) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -2979,6 +2993,40 @@ __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) {
   return cudaStreamSynchronizeInternal(stream);
 }
 
+// SST variant of cudaStreamSynchronize: performs a single, non-waiting check
+// instead of blocking the caller.
+//
+// stream == NULL selects the default stream, which is treated like a
+// thread-wide synchronize (gpgpu_context::synchronize_check()); any other
+// stream is checked through CUstream_st::synchronize_check().
+//
+// Returns cudaSuccess when the check reports the work is already done.
+// Otherwise the stream is flagged with set_request_synchronize() and
+// cudaErrorNotReady is returned; SST_Cycle polls that flag and invokes
+// SST_callback_cudaStreamSynchronize_done() once the stream has drained.
+//
+// NOTE(review): g_last_cudaError is left at cudaSuccess even on the
+// cudaErrorNotReady paths -- confirm this is intended.
+__host__ cudaError_t CUDARTAPI cudaStreamSynchronizeSST(cudaStream_t stream) {
+  gpgpu_context *ctx = GPGPU_Context();
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+
+  g_last_cudaError = cudaSuccess;
+
+  if (stream == NULL) {
+    // Default stream: sync is equivalent to cudaThreadSynchronize.
+    if (ctx->synchronize_check()) {
+      // Already done, so there is nothing to wait for.
+      return cudaSuccess;
+    }
+    // Mark that the caller is waiting for the default stream to sync.
+    ctx->the_gpgpusim->g_stream_manager->get_stream_zero()
+        ->set_request_synchronize();
+    return cudaErrorNotReady;
+  }
+
+  // Any other stream: done as soon as it has no more operations.
+  if (stream->synchronize_check()) {
+    return cudaSuccess;
+  }
+  stream->set_request_synchronize();
+  return cudaErrorNotReady;
+}
+
 __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -3054,6 +3102,28 @@ __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
   return g_last_cudaError = cudaSuccess;
 }
 
+// SST variant of cudaEventSynchronize: checks the event once and never waits.
+//
+// Returns cudaSuccess if the event already reports done(). Otherwise the
+// event is flagged with set_request_synchronize() and cudaErrorNotReady is
+// returned; the stream manager calls SST_callback_cudaEventSynchronize_done()
+// when the event's stream operation completes with that flag set.
+__host__ cudaError_t CUDARTAPI cudaEventSynchronizeSST(cudaEvent_t event) {
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+  printf("GPGPU-Sim API: cudaEventSynchronize ** waiting for event\n");
+  fflush(stdout);
+
+  CUevent_st *evt = (CUevent_st *)event;
+  if (!evt->done()) {
+    printf("GPGPU-Sim API: cudaEventSynchronize ** still waiting for event\n");
+    // Remember that the caller is waiting on this event.
+    evt->set_request_synchronize();
+    return cudaErrorNotReady;
+  }
+
+  printf("GPGPU-Sim API: cudaEventSynchronize ** event detected\n");
+  fflush(stdout);
+  return cudaSuccess;
+}
+
 __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -3113,6 +3183,7 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) {
     ctx->requested_synchronize = false;
     return cudaSuccess;
   } else {
+    ctx->requested_synchronize = true;
     return cudaErrorNotReady;
   }
 }
@@ -4022,6 +4093,18 @@ cudaError_t CUDARTAPI cudaSetDeviceFlags(int flags) {
   }
 }
 
+// SST variant of cudaSetDeviceFlags: currently a no-op that only logs the
+// call. SST's simple stream example relies on this entry point existing.
+//
+// flags: the requested device flags; printed but otherwise ignored.
+// Always returns cudaSuccess.
+cudaError_t CUDARTAPI cudaSetDeviceFlagsSST(int flags) {
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+  // flags is an int, so print it with an integer conversion; passing it to
+  // %p is a format/argument mismatch.
+  printf(
+    "GPGPU-Sim PTX: Execution warning: ignoring call to \"%s ( flag=0x%x)\"\n",
+    __my_func__, (unsigned)flags);
+  return cudaSuccess;
+}
+
 cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr,
                                             const char *hostFun) {
   return cudaFuncGetAttributesInternal(attr, hostFun);
diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc
@@ -56,7 +56,9 @@ class stream_manager *g_stream_manager() {
 
 // SST callback
 extern void SST_callback_cudaThreadSynchronize_done();
+extern void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream);
 __attribute__((weak)) void SST_callback_cudaThreadSynchronize_done() {}
+__attribute__((weak)) void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream) {}
 
 void *gpgpu_sim_thread_sequential(void *ctx_ptr) {
   gpgpu_context *ctx = (gpgpu_context *)ctx_ptr;
@@ -189,12 +191,33 @@ bool SST_Cycle() {
   // Check if Synchronize is done when SST previously requested
   // cudaThreadSynchronize
   if (GPGPU_Context()->requested_synchronize &&
-      ((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) ||
+      ((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) ||
        GPGPUsim_ctx_ptr()->g_sim_done)) {
     SST_callback_cudaThreadSynchronize_done();
     GPGPU_Context()->requested_synchronize = false;
   }
 
+  // Polling to check for each stream if it is marked for requested with sync
+  if (g_stream_manager()->get_stream_zero()->requested_synchronize() &&
+      ((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) || 
+      GPGPUsim_ctx_ptr()->g_sim_done)) {
+    SST_callback_cudaStreamSynchronize_done(0);
+    g_stream_manager()->get_stream_zero()->reset_request_synchronize();
+  }
+
+  // Iterate through each stream to check if SST is waiting on
+  // it and it does not have any operation
+  std::list<CUstream_st *>& streams = g_stream_manager()->get_concurrent_streams();
+  for (auto it = streams.begin(); it != streams.end(); it++) {
+    CUstream_st *stream = *it;
+    if (stream->requested_synchronize() &&
+        stream->empty()) {
+      // This stream is ready
+      SST_callback_cudaStreamSynchronize_done(stream);
+      stream->reset_request_synchronize();
+    }
+  }
+
   if (g_stream_manager()->empty_protected() &&
       !GPGPUsim_ctx_ptr()->g_sim_done && !g_the_gpu()->active()) {
     GPGPUsim_ctx_ptr()->g_sim_active = false;
@@ -272,7 +295,6 @@ void gpgpu_context::synchronize() {
 
 bool gpgpu_context::synchronize_check() {
   // printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n");
-  requested_synchronize = true;
   the_gpgpusim->g_stream_manager->print(stdout);
   fflush(stdout);
   //    sem_wait(&g_sim_signal_finish);
diff --git a/src/stream_manager.cc b/src/stream_manager.cc
@@ -34,15 +34,20 @@
 
 unsigned CUstream_st::sm_next_stream_uid = 0;
 
-// SST memcpy callbacks
-extern void SST_callback_memcpy_H2D_done();
-extern void SST_callback_memcpy_D2H_done();
+// SST memcpy callbacks, called after a stream operation is done via record_next_done()
+extern void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
+extern void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
 extern void SST_callback_memcpy_to_symbol_done();
 extern void SST_callback_memcpy_from_symbol_done();
-__attribute__((weak)) void SST_callback_memcpy_H2D_done() {}
-__attribute__((weak)) void SST_callback_memcpy_D2H_done() {}
+extern void SST_callback_cudaEventSynchronize_done(cudaEvent_t event);
+extern void SST_callback_kernel_done(cudaStream_t stream);
+__attribute__((weak)) void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
+__attribute__((weak)) void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
 __attribute__((weak)) void SST_callback_memcpy_to_symbol_done() {}
 __attribute__((weak)) void SST_callback_memcpy_from_symbol_done() {}
+// Give these callbacks empty weak bodies, like the memcpy callbacks above.
+// A weak declaration without a body leaves the symbol null when nothing else
+// defines it, and calling it would then jump to address 0.
+__attribute__((weak)) void SST_callback_cudaEventSynchronize_done(cudaEvent_t event) {}
+__attribute__((weak)) void SST_callback_kernel_done(cudaStream_t stream) {}
+
 
 CUstream_st::CUstream_st() {
   m_pending = false;
@@ -74,6 +79,10 @@ void CUstream_st::synchronize() {
   } while (!done);
 }
 
+// One-shot check used instead of the waiting loop in synchronize(): reports
+// whether this stream currently has no queued operations.
+// NOTE(review): reads m_operations without taking m_lock, unlike push() --
+// confirm callers only use this where that is safe.
+bool CUstream_st::synchronize_check() {
+  const bool no_pending_ops = m_operations.empty();
+  return no_pending_ops;
+}
+
 void CUstream_st::push(const stream_operation &op) {
   // called by host thread
   pthread_mutex_lock(&m_lock);
@@ -132,13 +141,15 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
       if (g_debug_execution >= 3) printf("memcpy host-to-device\n");
       gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt);
       m_stream->record_next_done();
-      if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done();
+      if (gpu->is_SST_mode()) {
+        SST_callback_memcpy_H2D_done((uint64_t) m_device_address_dst, (uint64_t) m_host_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
+      }
       break;
     case stream_memcpy_device_to_host:
       if (g_debug_execution >= 3) printf("memcpy device-to-host\n");
       gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt);
       m_stream->record_next_done();
-      if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done();
+      if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done((uint64_t) m_host_address_dst, (uint64_t) m_device_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
       break;
     case stream_memcpy_device_to_device:
       if (g_debug_execution >= 3) printf("memcpy device-to-device\n");
@@ -194,6 +205,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
       time_t wallclock = time((time_t *)NULL);
       m_event->update(gpu->gpu_tot_sim_cycle, wallclock);
       m_stream->record_next_done();
+      if ((gpu->is_SST_mode()) && m_event->done() &&
+          m_event->requested_synchronize()) {
+        // Notify that the event is done
+        SST_callback_cudaEventSynchronize_done(m_event);
+        // Reset the sync flag as we have notified SST
+        m_event->reset_request_synchronize();
+      }
     } break;
     case stream_wait_event:
       // only allows next op to go if event is done
@@ -252,6 +270,9 @@ stream_manager::stream_manager(gpgpu_sim *gpu, bool cuda_launch_blocking) {
   m_cuda_launch_blocking = cuda_launch_blocking;
   pthread_mutex_init(&m_lock, NULL);
   m_last_stream = m_streams.begin();
+
+  // Mark stream zero as the default stream
+  m_stream_zero.set_stream_zero();
 }
 
 bool stream_manager::operation(bool *sim) {
@@ -303,6 +324,11 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) {
       //            grid_uid, stream->get_uid()); kernel_stat.flush();
       //            kernel_stat.close();
       stream->record_next_done();
+      // Callback to notify a kernel is done for SST's stream
+      // manager to support with nonblocking + blocking kernel launch
+      if (m_gpu->is_SST_mode()) {
+        SST_callback_kernel_done(stream->is_stream_zero_stream() ? 0 : stream);
+      }
       m_grid_id_to_stream.erase(grid_uid);
       kernel->notify_parent_finished();
       delete kernel;
diff --git a/src/stream_manager.h b/src/stream_manager.h
@@ -69,6 +69,11 @@ struct CUevent_st {
   void issue() { m_issued++; }
   unsigned int num_issued() const { return m_issued; }
 
+  // SST related, stating this event is requested to synchronize
+  void set_request_synchronize() { m_requested_synchronize = true; }
+  void reset_request_synchronize() { m_requested_synchronize = false; }
+  bool requested_synchronize() const { return m_requested_synchronize; }
+
  private:
   int m_uid;
   bool m_blocking;
@@ -77,6 +82,9 @@ struct CUevent_st {
   unsigned int m_issued;
   time_t m_wallclock;
   double m_gpu_tot_sim_cycle;
+  
+  // SST related
+  bool m_requested_synchronize = false;
 
   static int m_next_event_uid;
 };
@@ -226,13 +234,20 @@ struct CUstream_st {
   bool empty();
   bool busy();
   void synchronize();
+  bool synchronize_check();
   void push(const stream_operation &op);
   void record_next_done();
   stream_operation next();
   void cancel_front();  // front operation fails, cancle the pending status
   stream_operation &front() { return m_operations.front(); }
   void print(FILE *fp);
   unsigned get_uid() const { return m_uid; }
+  void set_request_synchronize() { m_requested_synchronize = true; }
+  void reset_request_synchronize() { m_requested_synchronize = false; }
+  bool requested_synchronize() const { return m_requested_synchronize; }
+  void set_stream_zero() { is_stream_zero = true; }
+  bool is_stream_zero_stream() { return is_stream_zero; }
+  void reset_stream_zero() { is_stream_zero = false; }
 
  private:
   unsigned m_uid;
@@ -243,6 +258,11 @@ struct CUstream_st {
 
   pthread_mutex_t m_lock;  // ensure only one host or gpu manipulates stream
                            // operation at one time
+
+  // SST related, use to record the stream is requested to synchronize
+  bool m_requested_synchronize = false;
+  // Whether this is the default stream
+  bool is_stream_zero = false;
 };
 
 class stream_manager {
@@ -263,6 +283,8 @@ class stream_manager {
   void stop_all_running_kernels();
   unsigned size() { return m_streams.size(); };
   bool is_blocking() { return m_cuda_launch_blocking; };
+  CUstream_st *get_stream_zero() { return &m_stream_zero; };
+  std::list<CUstream_st *>& get_concurrent_streams() { return m_streams; };
 
  private:
   void print_impl(FILE *fp);

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +    "name": "CUDA 12.8",
 +    "image": "ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8"
 +}