[Offload] Guard HSA implicit arguments if they aren't created (#133073)

jhuber6 · web-flow · commit 75f810e02566 · 2025-03-26T08:54:33.000-05:00
Summary: We conditionally allocate the implicit arguments, so they possibly are null. The flang compiler seems to hit this case, even though it shouldn't when it's supposed to conform to the HSA code object. For now guard this to fix the regression and cover a case in the future where someone rolls a fully custom implementatation. Fixes: #132982
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3363,16 +3363,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
     return Err;
 
-  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
-  if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
-    // Initialize implicit arguments.
-    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
-        utils::advancePtr(AllArgs, LaunchParams.Size));
-
-    // Initialize the implicit arguments to zero.
-    std::memset(ImplArgs, 0, getImplicitArgsSize());
-  }
-
   // Copy the explicit arguments.
   // TODO: We should expose the args memory manager alloc to the common part as
   // 	   alternative to copying them twice.
@@ -3385,17 +3375,24 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
-  // Set the COV5+ implicit arguments to the appropriate values.
-  ImplArgs->BlockCountX = NumBlocks[0];
-  ImplArgs->BlockCountY = NumBlocks[1];
-  ImplArgs->BlockCountZ = NumBlocks[2];
-  ImplArgs->GroupSizeX = NumThreads[0];
-  ImplArgs->GroupSizeY = NumThreads[1];
-  ImplArgs->GroupSizeZ = NumThreads[2];
-  ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
-                           ? 3
-                           : 1 + (NumBlocks[1] * NumThreads[1] != 1);
-  ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
+  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
+  if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
+    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
+        utils::advancePtr(AllArgs, LaunchParams.Size));
+
+    // Set the COV5+ implicit arguments to the appropriate values.
+    std::memset(ImplArgs, 0, getImplicitArgsSize());
+    ImplArgs->BlockCountX = NumBlocks[0];
+    ImplArgs->BlockCountY = NumBlocks[1];
+    ImplArgs->BlockCountZ = NumBlocks[2];
+    ImplArgs->GroupSizeX = NumThreads[0];
+    ImplArgs->GroupSizeY = NumThreads[1];
+    ImplArgs->GroupSizeZ = NumThreads[2];
+    ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
+                             ? 3
+                             : 1 + (NumBlocks[1] * NumThreads[1] != 1);
+    ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
+  }
 
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,