Skip to content

Commit 6ab2ca4

Browse files
authored
sst-integration-stream: make SST integration works with streams (#103)
* sst-integration-stream: add apis to make SST integration works with stream * Add dev container specs
1 parent 63e2548 commit 6ab2ca4

File tree

6 files changed

+170
-9
lines changed

6 files changed

+170
-9
lines changed

.devcontainer/devcontainer.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"name": "CUDA 12.8",
3+
"image": "ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8"
4+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"name": "SST CUDA 11.7",
3+
"image": "ghcr.io/accel-sim/accel-sim-framework:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly"
4+
}

libcuda/cuda_runtime_api.cc

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1809,6 +1809,8 @@ cudaDeviceGetAttributeInternal(int *value, enum cudaDeviceAttr attr, int device,
18091809
case 19:
18101810
*value = 0;
18111811
break;
1812+
case 20: // cudaDevAttrComputeMode for controlling cudaSetDevice for threads
1813+
*value = 0; // Dummy value, should not affect simulation
18121814
case 21:
18131815
case 22:
18141816
case 23:
@@ -2429,6 +2431,18 @@ void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
24292431
->SST_gpgpusim_numcores_equal_check(sst_numcores);
24302432
}
24312433

2434+
/**
2435+
* @brief For SST to check if kernel launch is blocking
2436+
* Future: we will need a better interface to the
2437+
* GPGPU-Sim config for integration with outside
2438+
* simulators.
2439+
*
2440+
*/
2441+
bool SST_gpgpusim_launch_blocking() {
2442+
return GPGPU_Context()->the_gpgpusim->g_stream_manager->is_blocking();
2443+
2444+
}
2445+
24322446
uint64_t cudaMallocSST(void **devPtr, size_t size) {
24332447
if (g_debug_execution >= 3) {
24342448
announce_call(__my_func__);
@@ -2979,6 +2993,40 @@ __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) {
29792993
return cudaStreamSynchronizeInternal(stream);
29802994
}
29812995

2996+
__host__ cudaError_t CUDARTAPI cudaStreamSynchronizeSST(cudaStream_t stream) {
2997+
// For SST, perform a one-time check
2998+
gpgpu_context *ctx = GPGPU_Context();
2999+
if (g_debug_execution >= 3) {
3000+
announce_call(__my_func__);
3001+
}
3002+
3003+
// default stream: all is done
3004+
// other streams: no more ops
3005+
g_last_cudaError = cudaSuccess;
3006+
if (stream == NULL) {
3007+
// For default stream, sync is equivalent to cudaThreadSync
3008+
bool thread_synced = ctx->synchronize_check();
3009+
if (thread_synced) {
3010+
// We are already done, so no need to poll for sync done
3011+
return cudaSuccess;
3012+
} else {
3013+
// Otherwise we mark we should wait for default strem to sync
3014+
ctx->the_gpgpusim->g_stream_manager->get_stream_zero()->set_request_synchronize();
3015+
return cudaErrorNotReady;
3016+
}
3017+
} else {
3018+
// For other stream, check if it is already sync'ed
3019+
bool stream_synced = stream->synchronize_check();
3020+
if (stream_synced) {
3021+
return cudaSuccess;
3022+
} else {
3023+
stream->set_request_synchronize();
3024+
return cudaErrorNotReady;
3025+
}
3026+
}
3027+
return g_last_cudaError = cudaSuccess;
3028+
}
3029+
29823030
__host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
29833031
if (g_debug_execution >= 3) {
29843032
announce_call(__my_func__);
@@ -3054,6 +3102,28 @@ __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
30543102
return g_last_cudaError = cudaSuccess;
30553103
}
30563104

3105+
__host__ cudaError_t CUDARTAPI cudaEventSynchronizeSST(cudaEvent_t event) {
3106+
// For SST, perform a one-time check
3107+
// and let stream manager send the callback once the event is done
3108+
if (g_debug_execution >= 3) {
3109+
announce_call(__my_func__);
3110+
}
3111+
printf("GPGPU-Sim API: cudaEventSynchronize ** waiting for event\n");
3112+
fflush(stdout);
3113+
CUevent_st *e = (CUevent_st *)event;
3114+
bool event_sync_done = e->done();
3115+
if (event_sync_done) {
3116+
printf("GPGPU-Sim API: cudaEventSynchronize ** event detected\n");
3117+
fflush(stdout);
3118+
return cudaSuccess;
3119+
} else {
3120+
printf("GPGPU-Sim API: cudaEventSynchronize ** still waiting for event\n");
3121+
// Mark this event as waiting for synchronization
3122+
e->set_request_synchronize();
3123+
return cudaErrorNotReady;
3124+
}
3125+
}
3126+
30573127
__host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) {
30583128
if (g_debug_execution >= 3) {
30593129
announce_call(__my_func__);
@@ -3113,6 +3183,7 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) {
31133183
ctx->requested_synchronize = false;
31143184
return cudaSuccess;
31153185
} else {
3186+
ctx->requested_synchronize = true;
31163187
return cudaErrorNotReady;
31173188
}
31183189
}
@@ -4022,6 +4093,18 @@ cudaError_t CUDARTAPI cudaSetDeviceFlags(int flags) {
40224093
}
40234094
}
40244095

4096+
cudaError_t CUDARTAPI cudaSetDeviceFlagsSST(int flags) {
4097+
if (g_debug_execution >= 3) {
4098+
announce_call(__my_func__);
4099+
}
4100+
// SST's simple stream example relies on this
4101+
// currently just set it to no-op
4102+
printf(
4103+
"GPGPU-Sim PTX: Execution warning: ignoring call to \"%s ( flag=%p)\"\n",
4104+
__my_func__, flags);
4105+
return cudaSuccess;
4106+
}
4107+
40254108
cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr,
40264109
const char *hostFun) {
40274110
return cudaFuncGetAttributesInternal(attr, hostFun);

src/gpgpusim_entrypoint.cc

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ class stream_manager *g_stream_manager() {
5656

5757
// SST callback
5858
extern void SST_callback_cudaThreadSynchronize_done();
59+
extern void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream);
5960
__attribute__((weak)) void SST_callback_cudaThreadSynchronize_done() {}
61+
__attribute__((weak)) void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream) {}
6062

6163
void *gpgpu_sim_thread_sequential(void *ctx_ptr) {
6264
gpgpu_context *ctx = (gpgpu_context *)ctx_ptr;
@@ -189,12 +191,33 @@ bool SST_Cycle() {
189191
// Check if Synchronize is done when SST previously requested
190192
// cudaThreadSynchronize
191193
if (GPGPU_Context()->requested_synchronize &&
192-
((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) ||
194+
((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) ||
193195
GPGPUsim_ctx_ptr()->g_sim_done)) {
194196
SST_callback_cudaThreadSynchronize_done();
195197
GPGPU_Context()->requested_synchronize = false;
196198
}
197199

200+
// Polling to check for each stream if it is marked for requested with sync
201+
if (g_stream_manager()->get_stream_zero()->requested_synchronize() &&
202+
((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) ||
203+
GPGPUsim_ctx_ptr()->g_sim_done)) {
204+
SST_callback_cudaStreamSynchronize_done(0);
205+
g_stream_manager()->get_stream_zero()->reset_request_synchronize();
206+
}
207+
208+
// Iterate through each stream to check if SST is waiting on
209+
// it and it does not have any operation
210+
std::list<CUstream_st *>& streams = g_stream_manager()->get_concurrent_streams();
211+
for (auto it = streams.begin(); it != streams.end(); it++) {
212+
CUstream_st *stream = *it;
213+
if (stream->requested_synchronize() &&
214+
stream->empty()) {
215+
// This stream is ready
216+
SST_callback_cudaStreamSynchronize_done(stream);
217+
stream->reset_request_synchronize();
218+
}
219+
}
220+
198221
if (g_stream_manager()->empty_protected() &&
199222
!GPGPUsim_ctx_ptr()->g_sim_done && !g_the_gpu()->active()) {
200223
GPGPUsim_ctx_ptr()->g_sim_active = false;
@@ -272,7 +295,6 @@ void gpgpu_context::synchronize() {
272295

273296
bool gpgpu_context::synchronize_check() {
274297
// printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n");
275-
requested_synchronize = true;
276298
the_gpgpusim->g_stream_manager->print(stdout);
277299
fflush(stdout);
278300
// sem_wait(&g_sim_signal_finish);

src/stream_manager.cc

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,20 @@
3434

3535
unsigned CUstream_st::sm_next_stream_uid = 0;
3636

37-
// SST memcpy callbacks
38-
extern void SST_callback_memcpy_H2D_done();
39-
extern void SST_callback_memcpy_D2H_done();
37+
// SST memcpy callbacks, called after a stream operation is done via record_next_done()
38+
extern void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
39+
extern void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
4040
extern void SST_callback_memcpy_to_symbol_done();
4141
extern void SST_callback_memcpy_from_symbol_done();
42-
__attribute__((weak)) void SST_callback_memcpy_H2D_done() {}
43-
__attribute__((weak)) void SST_callback_memcpy_D2H_done() {}
42+
extern void SST_callback_cudaEventSynchronize_done(cudaEvent_t event);
43+
extern void SST_callback_kernel_done(cudaStream_t stream);
44+
__attribute__((weak)) void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
45+
__attribute__((weak)) void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
4446
__attribute__((weak)) void SST_callback_memcpy_to_symbol_done() {}
4547
__attribute__((weak)) void SST_callback_memcpy_from_symbol_done() {}
48+
__attribute__((weak)) void SST_callback_cudaEventSynchronize_done(cudaEvent_t event);
49+
__attribute__((weak)) void SST_callback_kernel_done(cudaStream_t stream);
50+
4651

4752
CUstream_st::CUstream_st() {
4853
m_pending = false;
@@ -74,6 +79,10 @@ void CUstream_st::synchronize() {
7479
} while (!done);
7580
}
7681

82+
bool CUstream_st::synchronize_check() {
83+
return m_operations.empty();
84+
}
85+
7786
void CUstream_st::push(const stream_operation &op) {
7887
// called by host thread
7988
pthread_mutex_lock(&m_lock);
@@ -132,13 +141,15 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
132141
if (g_debug_execution >= 3) printf("memcpy host-to-device\n");
133142
gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt);
134143
m_stream->record_next_done();
135-
if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done();
144+
if (gpu->is_SST_mode()) {
145+
SST_callback_memcpy_H2D_done((uint64_t) m_device_address_dst, (uint64_t) m_host_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
146+
}
136147
break;
137148
case stream_memcpy_device_to_host:
138149
if (g_debug_execution >= 3) printf("memcpy device-to-host\n");
139150
gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt);
140151
m_stream->record_next_done();
141-
if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done();
152+
if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done((uint64_t) m_host_address_dst, (uint64_t) m_device_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
142153
break;
143154
case stream_memcpy_device_to_device:
144155
if (g_debug_execution >= 3) printf("memcpy device-to-device\n");
@@ -194,6 +205,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
194205
time_t wallclock = time((time_t *)NULL);
195206
m_event->update(gpu->gpu_tot_sim_cycle, wallclock);
196207
m_stream->record_next_done();
208+
if ((gpu->is_SST_mode()) && m_event->done() &&
209+
m_event->requested_synchronize()) {
210+
// Notify that the event is done
211+
SST_callback_cudaEventSynchronize_done(m_event);
212+
// Reset the sync flag as we have notified SST
213+
m_event->reset_request_synchronize();
214+
}
197215
} break;
198216
case stream_wait_event:
199217
// only allows next op to go if event is done
@@ -252,6 +270,9 @@ stream_manager::stream_manager(gpgpu_sim *gpu, bool cuda_launch_blocking) {
252270
m_cuda_launch_blocking = cuda_launch_blocking;
253271
pthread_mutex_init(&m_lock, NULL);
254272
m_last_stream = m_streams.begin();
273+
274+
// Mark stream zero as the default stream
275+
m_stream_zero.set_stream_zero();
255276
}
256277

257278
bool stream_manager::operation(bool *sim) {
@@ -303,6 +324,11 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) {
303324
// grid_uid, stream->get_uid()); kernel_stat.flush();
304325
// kernel_stat.close();
305326
stream->record_next_done();
327+
// Callback to notify a kernel is done for SST's stream
328+
// manager to support with nonblocking + blocking kernel launch
329+
if (m_gpu->is_SST_mode()) {
330+
SST_callback_kernel_done(stream->is_stream_zero_stream() ? 0 : stream);
331+
}
306332
m_grid_id_to_stream.erase(grid_uid);
307333
kernel->notify_parent_finished();
308334
delete kernel;

src/stream_manager.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,11 @@ struct CUevent_st {
6969
void issue() { m_issued++; }
7070
unsigned int num_issued() const { return m_issued; }
7171

72+
// SST related, stating this event is requested to synchronize
73+
void set_request_synchronize() { m_requested_synchronize = true; }
74+
void reset_request_synchronize() { m_requested_synchronize = false; }
75+
bool requested_synchronize() const { return m_requested_synchronize; }
76+
7277
private:
7378
int m_uid;
7479
bool m_blocking;
@@ -77,6 +82,9 @@ struct CUevent_st {
7782
unsigned int m_issued;
7883
time_t m_wallclock;
7984
double m_gpu_tot_sim_cycle;
85+
86+
// SST related
87+
bool m_requested_synchronize = false;
8088

8189
static int m_next_event_uid;
8290
};
@@ -226,13 +234,20 @@ struct CUstream_st {
226234
bool empty();
227235
bool busy();
228236
void synchronize();
237+
bool synchronize_check();
229238
void push(const stream_operation &op);
230239
void record_next_done();
231240
stream_operation next();
232241
void cancel_front(); // front operation fails, cancle the pending status
233242
stream_operation &front() { return m_operations.front(); }
234243
void print(FILE *fp);
235244
unsigned get_uid() const { return m_uid; }
245+
void set_request_synchronize() { m_requested_synchronize = true; }
246+
void reset_request_synchronize() { m_requested_synchronize = false; }
247+
bool requested_synchronize() const { return m_requested_synchronize; }
248+
void set_stream_zero() { is_stream_zero = true; }
249+
bool is_stream_zero_stream() { return is_stream_zero; }
250+
void reset_stream_zero() { is_stream_zero = false; }
236251

237252
private:
238253
unsigned m_uid;
@@ -243,6 +258,11 @@ struct CUstream_st {
243258

244259
pthread_mutex_t m_lock; // ensure only one host or gpu manipulates stream
245260
// operation at one time
261+
262+
// SST related, use to record the stream is requested to synchronize
263+
bool m_requested_synchronize = false;
264+
// Whether this is the default stream
265+
bool is_stream_zero = false;
246266
};
247267

248268
class stream_manager {
@@ -263,6 +283,8 @@ class stream_manager {
263283
void stop_all_running_kernels();
264284
unsigned size() { return m_streams.size(); };
265285
bool is_blocking() { return m_cuda_launch_blocking; };
286+
CUstream_st *get_stream_zero() { return &m_stream_zero; };
287+
std::list<CUstream_st *>& get_concurrent_streams() { return m_streams; };
266288

267289
private:
268290
void print_impl(FILE *fp);

0 commit comments

Comments
 (0)