[STF] Fix incorrect level index in 3-depth execution policy (#6089)

19970126ljl · caugonnet · pre-commit-ci[bot] · web-flow · commit b39237aca79d · 2025-10-06T09:48:22.000Z
* [STF] Fix incorrect level index in 3-depth execution policy
      Fixed a typo in places.cuh line 1665 where l2_size was incorrectly
      getting width from level 1 instead of level 2, causing Unsatisfiable

* Add a new test to check that we are using the proper CUDA kernel configuration in multi-level specs

* [pre-commit.ci] auto code formatting

---------

Co-authored-by: Cédric Augonnet <caugonnet@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Cédric Augonnet <158148890+caugonnet@users.noreply.github.com>
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -1661,7 +1661,7 @@ interpreted_execution_policy<spec...>::interpreted_execution_policy(
   {
     size_t l0_size = p.get_width(0);
     size_t l1_size = p.get_width(1);
-    size_t l2_size = p.get_width(1);
+    size_t l2_size = p.get_width(2);
     bool l0_sync   = thread_hierarchy_spec<spec...>::template is_synchronizable<0>;
     bool l1_sync   = thread_hierarchy_spec<spec...>::template is_synchronizable<1>;
     bool l2_sync   = thread_hierarchy_spec<spec...>::template is_synchronizable<2>;
diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt
@@ -52,6 +52,7 @@ set(stf_test_sources
   places/non_current_device.cu
   places/place_partition.cu
   places/recursion.cu
+  places/execution_policy_kernel_launch_test.cu
   reclaiming/graph.cu
   reclaiming/graph_2.cu
   reclaiming/graph_real_oom.cu
diff --git a/cudax/test/stf/places/execution_policy_kernel_launch_test.cu b/cudax/test/stf/places/execution_policy_kernel_launch_test.cu
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+//! \file
+//! \brief Check that multi-level launch specifications are fulfilled
+
+#include <cuda/experimental/stf.cuh>
+
+#include <cassert>
+#include <iostream>
+
+using namespace cuda::experimental::stf;
+
+int main() // Test entry point: returns 0 if the kernel saw the expected hierarchy, 1 otherwise.
+{
+  stream_ctx ctx;
+
+  // Create a 3-level thread hierarchy specification whose level-1 and level-2 widths
+  // differ, so a level-2 width mistakenly taken from level 1 would be detected:
+  // Level 0: only 1 device, so the test can run on CI
+  // Level 1: 4 blocks per device (width 4)
+  // Level 2: 64 threads per block (width 64)
+  auto spec = par(hw_scope::device, 1, con<4>(hw_scope::block, con<64>(hw_scope::thread)));
+
+  int test_result    = 0; // stays 0 (failure) unless the kernel writes 1
+  auto l_test_result = ctx.logical_data(make_slice(&test_result, 1));
+
+  ctx.launch(spec, exec_place::current_device(), l_test_result.rw())->*[] __device__(auto th, auto result) {
+    if (th.rank() == 0) // only the rank-0 thread checks and writes the result
+    {
+      bool level0_correct = (th.size(0) == 1); // device level
+      bool level1_correct = (th.size(1) == 1 * 4) && (gridDim.x == 4); // blocks per device
+      bool level2_correct = (th.size(2) == 1 * 4 * 64) && (blockDim.x == 64); // threads per block
+
+      // Report 1 only when every level matches its expected size; otherwise report 0
+      result[0] = level0_correct && level1_correct && level2_correct ? 1 : 0;
+    }
+  };
+
+  ctx.finalize(); // presumably completes the launch and updates test_result on the host - TODO confirm
+
+  if (test_result != 1)
+  {
+    fprintf(stderr, "FAIL: Hierarchy dimensions are incorrect!\n"); // NOTE(review): no direct <cstdio> include
+    return 1;
+  }
+
+  return 0;
+}

Original file line number	Diff line number	Diff line change
`@@ -1661,7 +1661,7 @@ interpreted_execution_policy<spec...>::interpreted_execution_policy(`
`1661`	`1661`	`{`
`1662`	`1662`	`size_t l0_size = p.get_width(0);`
`1663`	`1663`	`size_t l1_size = p.get_width(1);`
`1664`		`- size_t l2_size = p.get_width(1);`
	`1664`	`+ size_t l2_size = p.get_width(2);`
`1665`	`1665`	`bool l0_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<0>;`
`1666`	`1666`	`bool l1_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<1>;`
`1667`	`1667`	`bool l2_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<2>;`