@@ -715,16 +715,12 @@ struct AMDGPUQueueTy {
715
715
std::lock_guard<std::mutex> Lock (Mutex);
716
716
assert (Queue && " Interacted with a non-initialized queue!" );
717
717
718
- // Avoid defining the input dependency if already satisfied.
719
- if (InputSignal && !InputSignal->load ())
720
- InputSignal = nullptr ;
721
-
722
718
// Add a barrier packet before the kernel packet in case there is a pending
723
719
// preceding operation. The barrier packet will delay the processing of
724
720
// subsequent queue's packets until the barrier input signal is satisfied.
725
721
// No output signal is needed because the dependency is already guaranteed
726
722
// by the queue barrier itself.
727
- if (InputSignal)
723
+ if (InputSignal && InputSignal-> load () )
728
724
if (auto Err = pushBarrierImpl (nullptr , InputSignal))
729
725
return Err;
730
726
@@ -1254,12 +1250,8 @@ struct AMDGPUStreamTy {
1254
1250
// Consume stream slot and compute dependencies.
1255
1251
auto [Curr, InputSignal] = consume (OutputSignal);
1256
1252
1257
- // Avoid defining the input dependency if already satisfied.
1258
- if (InputSignal && !InputSignal->load ())
1259
- InputSignal = nullptr ;
1260
-
1261
1253
// Issue the async memory copy.
1262
- if (InputSignal) {
1254
+ if (InputSignal && InputSignal-> load () ) {
1263
1255
hsa_signal_t InputSignalRaw = InputSignal->get ();
1264
1256
return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1265
1257
CopySize, 1 , &InputSignalRaw,
@@ -1293,17 +1285,13 @@ struct AMDGPUStreamTy {
1293
1285
// Consume stream slot and compute dependencies.
1294
1286
auto [Curr, InputSignal] = consume (OutputSignals[0 ]);
1295
1287
1296
- // Avoid defining the input dependency if already satisfied.
1297
- if (InputSignal && !InputSignal->load ())
1298
- InputSignal = nullptr ;
1299
-
1300
1288
// Setup the post action for releasing the intermediate buffer.
1301
1289
if (auto Err = Slots[Curr].schedReleaseBuffer (Inter, MemoryManager))
1302
1290
return Err;
1303
1291
1304
1292
// Issue the first step: device to host transfer. Avoid defining the input
1305
1293
// dependency if already satisfied.
1306
- if (InputSignal) {
1294
+ if (InputSignal && InputSignal-> load () ) {
1307
1295
hsa_signal_t InputSignalRaw = InputSignal->get ();
1308
1296
if (auto Err = utils::asyncMemCopy (
1309
1297
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1 ,
@@ -1361,12 +1349,8 @@ struct AMDGPUStreamTy {
1361
1349
// Consume stream slot and compute dependencies.
1362
1350
auto [Curr, InputSignal] = consume (OutputSignal);
1363
1351
1364
- // Avoid defining the input dependency if already satisfied.
1365
- if (InputSignal && !InputSignal->load ())
1366
- InputSignal = nullptr ;
1367
-
1368
1352
// Issue the first step: host to host transfer.
1369
- if (InputSignal) {
1353
+ if (InputSignal && InputSignal-> load () ) {
1370
1354
// The std::memcpy is done asynchronously using an async handler. We store
1371
1355
// the function's information in the action but it is not actually a
1372
1356
// post action.
@@ -1429,10 +1413,6 @@ struct AMDGPUStreamTy {
1429
1413
// Consume stream slot and compute dependencies.
1430
1414
auto [Curr, InputSignal] = consume (OutputSignal);
1431
1415
1432
- // Avoid defining the input dependency if already satisfied.
1433
- if (InputSignal && !InputSignal->load ())
1434
- InputSignal = nullptr ;
1435
-
1436
1416
// The agents need to have access to the corresponding memory
1437
1417
// This is presently only true if the pointers were originally
1438
1418
// allocated by this runtime or the caller made the appropriate
0 commit comments