Skip to content

Commit b39237a

Browse files
19970126ljlcaugonnetpre-commit-ci[bot]
authored
[STF] Fix incorrect level index in 3-depth execution policy (#6089)
* [STF] Fix incorrect level index in 3-depth execution policy Fixed a typo in places.cuh line 1665 where l2_size was incorrectly getting width from level 1 instead of level 2, causing Unsatisfiable * Add a new test to check that we are using the proper CUDA kernel configuration in multi-level specs * [pre-commit.ci] auto code formatting --------- Co-authored-by: Cédric Augonnet <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Cédric Augonnet <[email protected]>
1 parent a1683b9 commit b39237a

File tree

3 files changed

+58
-1
lines changed

3 files changed

+58
-1
lines changed

cudax/include/cuda/experimental/__stf/places/places.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1661,7 +1661,7 @@ interpreted_execution_policy<spec...>::interpreted_execution_policy(
16611661
{
16621662
size_t l0_size = p.get_width(0);
16631663
size_t l1_size = p.get_width(1);
1664-
size_t l2_size = p.get_width(1);
1664+
size_t l2_size = p.get_width(2);
16651665
bool l0_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<0>;
16661666
bool l1_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<1>;
16671667
bool l2_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<2>;

cudax/test/stf/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ set(stf_test_sources
5252
places/non_current_device.cu
5353
places/place_partition.cu
5454
places/recursion.cu
55+
places/execution_policy_kernel_launch_test.cu
5556
reclaiming/graph.cu
5657
reclaiming/graph_2.cu
5758
reclaiming/graph_real_oom.cu
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDASTF in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
//! file
12+
//! !brief Check that multi-level launch specification are fulfilled
13+
14+
#include <cuda/experimental/stf.cuh>
15+
16+
#include <cassert>
17+
#include <iostream>
18+
19+
using namespace cuda::experimental::stf;
20+
21+
int main()
22+
{
23+
stream_ctx ctx;
24+
25+
// Create a 3-level thread hierarchy specification that would expose the bug:
26+
// Level 0: only 1 device to run on CI
27+
// Level 1: 4 blocks per device (width 4)
28+
// Level 2: 64 threads per block (width 64)
29+
//
30+
auto spec = par(hw_scope::device, 1, con<4>(hw_scope::block, con<64>(hw_scope::thread)));
31+
32+
int test_result = 0;
33+
auto l_test_result = ctx.logical_data(make_slice(&test_result, 1));
34+
35+
ctx.launch(spec, exec_place::current_device(), l_test_result.rw())->*[] __device__(auto th, auto result) {
36+
if (th.rank() == 0)
37+
{
38+
bool level0_correct = (th.size(0) == 1); // device level
39+
bool level1_correct = (th.size(1) == 1 * 4) && (gridDim.x == 4); // blocks per device
40+
bool level2_correct = (th.size(2) == 1 * 4 * 64) && (blockDim.x == 64); // threads per block
41+
42+
// Set test result based on whether all levels are correct
43+
result[0] = level0_correct && level1_correct && level2_correct ? 1 : 0;
44+
}
45+
};
46+
47+
ctx.finalize();
48+
49+
if (test_result != 1)
50+
{
51+
fprintf(stderr, "FAIL: Hierarchy dimensions are incorrect!\n");
52+
return 1;
53+
}
54+
55+
return 0;
56+
}

0 commit comments

Comments
 (0)