@@ -34,15 +34,20 @@
 
 unsigned CUstream_st::sm_next_stream_uid = 0;
 
-// SST memcpy callbacks
-extern void SST_callback_memcpy_H2D_done();
-extern void SST_callback_memcpy_D2H_done();
+// SST callbacks (memcpy, event, kernel), called after a stream operation completes via record_next_done()
+extern void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
+extern void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
 extern void SST_callback_memcpy_to_symbol_done();
 extern void SST_callback_memcpy_from_symbol_done();
-__attribute__((weak)) void SST_callback_memcpy_H2D_done() {}
-__attribute__((weak)) void SST_callback_memcpy_D2H_done() {}
+extern void SST_callback_cudaEventSynchronize_done(cudaEvent_t event);
+extern void SST_callback_kernel_done(cudaStream_t stream);
+__attribute__((weak)) void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
+__attribute__((weak)) void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
 __attribute__((weak)) void SST_callback_memcpy_to_symbol_done() {}
 __attribute__((weak)) void SST_callback_memcpy_from_symbol_done() {}
+__attribute__((weak)) void SST_callback_cudaEventSynchronize_done(cudaEvent_t event) {}
+__attribute__((weak)) void SST_callback_kernel_done(cudaStream_t stream) {}
+
 
 CUstream_st::CUstream_st() {
   m_pending = false;
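Note: the stubs above use weak linkage so a standalone GPGPU-Sim build links without SST; when the SST side is linked in, its strong definitions take precedence. A minimal sketch of such an override, assuming only the signatures declared in this diff (the body is a placeholder, not the actual SST implementation):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct CUstream_st;                        // opaque simulator stream
typedef struct CUstream_st *cudaStream_t;  // matches the CUDA runtime typedef

// Strong definition: the linker prefers this over the weak stub in
// stream_manager.cc, so memcpy completions are routed into SST.
void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count,
                                  cudaStream_t stream) {
  // Placeholder: a real component would complete the pending memcpy
  // request/response event here.
  std::printf("H2D done: dst=0x%llx src=0x%llx count=%zu stream=%p\n",
              (unsigned long long)dst, (unsigned long long)src, count,
              (void *)stream);
}
```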
@@ -74,6 +79,10 @@ void CUstream_st::synchronize() { |
   } while (!done);
 }
 
+bool CUstream_st::synchronize_check() {
+  return m_operations.empty();
+}
+
 void CUstream_st::push(const stream_operation &op) {
   // called by host thread
   pthread_mutex_lock(&m_lock);
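synchronize_check() gives SST a non-blocking alternative to synchronize(): the do-while spin above would stall an event-driven simulator, so the SST side is presumably meant to poll once per clock tick instead. (Note that the check appears to read m_operations without taking m_lock, so it is a hint unless the caller serializes with the simulator thread.) A sketch of that polling pattern, where everything except synchronize_check() is a hypothetical name:

```cpp
#include <functional>

// Hypothetical per-tick poll body; bind the diff's non-blocking check in as
// stream_drained, e.g. [&] { return stream->synchronize_check(); }.
bool poll_stream_sync(const std::function<bool()> &stream_drained) {
  if (!stream_drained())
    return false;  // operations still pending; poll again next tick
  // ...acknowledge cudaStreamSynchronize() back to the application here...
  return true;     // queue drained; stop polling
}
```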
@@ -132,13 +141,15 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { |
       if (g_debug_execution >= 3) printf("memcpy host-to-device\n");
       gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt);
       m_stream->record_next_done();
-      if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done();
+      if (gpu->is_SST_mode()) {
+        SST_callback_memcpy_H2D_done((uint64_t) m_device_address_dst, (uint64_t) m_host_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
+      }
       break;
     case stream_memcpy_device_to_host:
       if (g_debug_execution >= 3) printf("memcpy device-to-host\n");
       gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt);
       m_stream->record_next_done();
-      if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done();
+      if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done((uint64_t) m_host_address_dst, (uint64_t) m_device_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
       break;
     case stream_memcpy_device_to_device:
       if (g_debug_execution >= 3) printf("memcpy device-to-device\n");
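Both memcpy callbacks translate the stream handle with `is_stream_zero_stream() ? 0 : m_stream`: GPGPU-Sim backs the default stream with a real internal object, but the application on the SST side only knows it by the null handle. A self-contained sketch of the flag pair and the mapping (only set_stream_zero() and is_stream_zero_stream() appear in this diff; the member and helper names are assumptions):

```cpp
// Sketch of the default-stream flag assumed by this diff.
struct CUstream_st_sketch {
  bool m_is_stream_zero = false;  // assumed backing member
  void set_stream_zero() { m_is_stream_zero = true; }
  bool is_stream_zero_stream() const { return m_is_stream_zero; }
};

// The ternary used at each call site, factored out: report the simulator's
// default-stream object to the application as the null handle it used.
static inline CUstream_st_sketch *to_api_handle(CUstream_st_sketch *s) {
  return s->is_stream_zero_stream() ? nullptr : s;
}
```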
@@ -194,6 +205,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { |
       time_t wallclock = time((time_t *)NULL);
       m_event->update(gpu->gpu_tot_sim_cycle, wallclock);
       m_stream->record_next_done();
+      if ((gpu->is_SST_mode()) && m_event->done() &&
+          m_event->requested_synchronize()) {
+        // Notify that the event is done
+        SST_callback_cudaEventSynchronize_done(m_event);
+        // Reset the sync flag as we have notified SST
+        m_event->reset_request_synchronize();
+      }
     } break;
     case stream_wait_event:
       // only allows next op to go if event is done
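The new block fires the callback only when the event has both completed (done()) and has a waiter registered (requested_synchronize()), and clearing the flag afterwards makes the notification fire at most once per cudaEventSynchronize(). A sketch of the flag protocol this implies (only the three accessor names come from this diff; the member is an assumption):

```cpp
// Sketch of the CUevent_st synchronize-request flag used above.
class CUevent_st_sketch {
  bool m_requested_synchronize = false;  // assumed backing member
 public:
  // Presumably set when the SST side handles cudaEventSynchronize().
  void request_synchronize() { m_requested_synchronize = true; }
  bool requested_synchronize() const { return m_requested_synchronize; }
  // Cleared once SST_callback_cudaEventSynchronize_done() has been delivered.
  void reset_request_synchronize() { m_requested_synchronize = false; }
};
```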
@@ -252,6 +270,9 @@ stream_manager::stream_manager(gpgpu_sim *gpu, bool cuda_launch_blocking) { |
   m_cuda_launch_blocking = cuda_launch_blocking;
   pthread_mutex_init(&m_lock, NULL);
   m_last_stream = m_streams.begin();
+
+  // Mark stream zero as the default stream
+  m_stream_zero.set_stream_zero();
 }
 
 bool stream_manager::operation(bool *sim) {
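Flagging m_stream_zero in the constructor is what makes the `? 0 : stream` translation in the callbacks work for operations queued on the default stream. On the receiving side, SST can then key its per-stream bookkeeping directly on the reported handle; a hypothetical sketch (the table and helpers are not part of this diff):

```cpp
#include <unordered_map>

struct CUstream_st;
typedef CUstream_st *cudaStream_t;

// Hypothetical SST-side table of in-flight operations per stream; the null
// handle (default stream) is just another key.
static std::unordered_map<cudaStream_t, int> g_inflight_ops;

void note_op_issued(cudaStream_t s) { ++g_inflight_ops[s]; }
void note_op_done(cudaStream_t s) { --g_inflight_ops[s]; }
```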
@@ -303,6 +324,11 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) { |
     // grid_uid, stream->get_uid()); kernel_stat.flush();
     // kernel_stat.close();
     stream->record_next_done();
+    // Callback to notify SST's stream manager that a kernel has finished,
+    // supporting both blocking and nonblocking kernel launches
+    if (m_gpu->is_SST_mode()) {
+      SST_callback_kernel_done(stream->is_stream_zero_stream() ? 0 : stream);
+    }
     m_grid_id_to_stream.erase(grid_uid);
     kernel->notify_parent_finished();
     delete kernel;
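The kernel-completion callback follows the same null-handle convention as the memcpy ones, and the comment's point about blocking vs. nonblocking launches suggests a consumer along these lines (all SST-side names here are assumptions):

```cpp
struct CUstream_st;
typedef CUstream_st *cudaStream_t;

// Hypothetical SST-side launch state.
struct LaunchState {
  bool blocking_launch_pending = false;  // set for cuda_launch_blocking mode
  int in_flight_kernels = 0;
};

void on_kernel_done(LaunchState &st, cudaStream_t stream) {
  --st.in_flight_kernels;  // stream == 0 means the default stream
  if (st.blocking_launch_pending) {
    st.blocking_launch_pending = false;
    // ...send the deferred cudaLaunchKernel response to the application...
  }
  // For nonblocking launches nothing else is needed; the stream manager
  // simply learns that the next queued operation may issue.
}
```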