Skip to content

Commit 092d854

Browse files
committed
[DevTSAN] Support device thread sanitizer for sycl::buffer
1 parent 7d40de9 commit 092d854

File tree

7 files changed

+1367
-0
lines changed

7 files changed

+1367
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// REQUIRES: linux, cpu || (gpu && level_zero)
// ALLOW_RETRIES: 10
// RUN: %{build} %device_tsan_flags -O0 -g -o %t1.out
// RUN: %{run} %t1.out 2>&1 | FileCheck %s
// RUN: %{build} %device_tsan_flags -O2 -g -o %t2.out
// RUN: %{run} %t2.out 2>&1 | FileCheck %s

#include <sycl/detail/core.hpp>

// Number of work-items; every one increments the same element, so the
// device thread sanitizer must report a data race on it.
static const int N = 16;

int main() {
  sycl::queue q;

  std::vector<int> v(N);

  {
    // We intentionally make sycl::buffer use a host pointer and trigger the
    // data write-back on buffer destruction: in unified runtime we intercept
    // sycl::buffer with USM, so that pattern needs to be covered here.
    sycl::buffer<int, 1> buf(v.data(), v.size());
    q.submit([&](sycl::handler &h) {
       auto A = buf.get_access<sycl::access::mode::read_write>(h);
       h.parallel_for<class Test>(
           sycl::nd_range<1>(N, 1),
           [=](sycl::nd_item<1>) { A[0]++; });
     }).wait();
    // NOTE: [[@LINE-4]] must keep pointing at the "A[0]++" lambda line above;
    // do not insert lines between the kernel body and these CHECK directives.
    // CHECK: WARNING: DeviceSanitizer: data race
    // CHECK-NEXT: When write of size 4 at 0x{{.*}} in kernel <{{.*}}Test>
    // CHECK-NEXT: #0 {{.*}}check_buffer.cpp:[[@LINE-4]]
  }

  return 0;
}

unified-runtime/source/loader/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ if(UR_ENABLE_SANITIZER)
163163
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_report.hpp
164164
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.cpp
165165
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.hpp
166+
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_buffer.cpp
167+
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_buffer.hpp
166168
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_ddi.cpp
167169
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_ddi.hpp
168170
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_interceptor.cpp
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
/*
2+
*
3+
* Copyright (C) 2025 Intel Corporation
4+
*
5+
* Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
* Exceptions. See LICENSE.TXT
7+
*
8+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
9+
*
10+
* @file tsan_buffer.cpp
11+
*
12+
*/
13+
14+
#include "tsan_buffer.hpp"
15+
#include "tsan_interceptor.hpp"
16+
#include "sanitizer_common/sanitizer_utils.hpp"
17+
#include "ur_sanitizer_layer.hpp"
18+
19+
namespace ur_sanitizer_layer {
20+
namespace tsan {
21+
22+
/// Enqueue a (possibly 3D) rectangular memory copy between two USM pointers.
///
/// Row/slice pitches of 0 default to a tightly-packed layout derived from
/// \p Region. The copy is decomposed into one 2D memcpy per slice because
/// USM has no native 3D copy operation.
///
/// \param Blocking when true, wait for all per-slice copies to complete
///        before returning.
/// \param Event optional; receives an event covering all per-slice copies.
/// \return UR_RESULT_SUCCESS or the first failing DDI call's result.
ur_result_t EnqueueMemCopyRectHelper(
    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
    bool Blocking, uint32_t NumEventsInWaitList,
    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) {
  // If user doesn't determine src/dst row pitch and slice pitch, just use
  // region for it.
  if (SrcRowPitch == 0) {
    SrcRowPitch = Region.width;
  }

  if (SrcSlicePitch == 0) {
    SrcSlicePitch = SrcRowPitch * Region.height;
  }

  if (DstRowPitch == 0) {
    DstRowPitch = Region.width;
  }

  if (DstSlicePitch == 0) {
    DstSlicePitch = DstRowPitch * Region.height;
  }

  // Calculate the src and dst addresses that actually will be copied.
  char *SrcOrigin = pSrc + SrcOffset.x + SrcRowPitch * SrcOffset.y +
                    SrcSlicePitch * SrcOffset.z;
  char *DstOrigin = pDst + DstOffset.x + DstRowPitch * DstOffset.y +
                    DstSlicePitch * DstOffset.z;

  std::vector<ur_event_handle_t> Events;
  Events.reserve(Region.depth);

  // For now, USM doesn't support 3D memory copy operation, so we can only
  // loop call 2D memory copy function to implement it.
  for (size_t i = 0; i < Region.depth; i++) {
    ur_event_handle_t NewEvent{};
    UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D(
        Queue, false, DstOrigin + (i * DstSlicePitch), DstRowPitch,
        SrcOrigin + (i * SrcSlicePitch), SrcRowPitch, Region.width,
        Region.height, NumEventsInWaitList, EventWaitList, &NewEvent));
    Events.push_back(NewEvent);
  }

  // `Events` is empty when Region.depth == 0; indexing `Events[0]` would be
  // undefined behavior then, so guard and use data() instead.
  if (Blocking && !Events.empty()) {
    UR_CALL(getContext()->urDdiTable.Event.pfnWait(
        static_cast<uint32_t>(Events.size()), Events.data()));
  }

  if (Event) {
    // With zero events this degenerates to a plain urEnqueueEventsWait on the
    // queue, which still produces a valid completion event for the caller.
    UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait(
        Queue, static_cast<uint32_t>(Events.size()), Events.data(), Event));
  }

  return UR_RESULT_SUCCESS;
}
79+
80+
/// Return (via `Handle`) the USM allocation backing this buffer on `Device`,
/// lazily allocating it on first use and migrating the buffer contents from
/// the last synced device when the device changes.
ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
  // Sub-buffers don't maintain own allocations but rely on parent buffer.
  if (SubBuffer) {
    UR_CALL(SubBuffer->Parent->getHandle(Device, Handle));
    Handle += SubBuffer->Origin;
    return UR_RESULT_SUCCESS;
  }

  // Device may be null, we follow the L0 adapter's practice to use the first
  // device
  if (!Device) {
    auto Devices = GetDevices(Context);
    assert(Devices.size() > 0 && "Devices should not be empty");
    Device = Devices[0];
  }
  assert((void *)Device != nullptr && "Device cannot be nullptr");

  // Serialize lazy allocation and migration: `Allocations` and
  // `LastSyncedDevice` are shared mutable state.
  std::scoped_lock<ur_shared_mutex> Guard(Mutex);
  // operator[] default-inserts a null pointer the first time this device is
  // seen; `Allocation` is a reference into the map, filled in below.
  auto &Allocation = Allocations[Device];
  ur_result_t URes = UR_RESULT_SUCCESS;
  if (!Allocation) {
    // First request on this device: create the device USM allocation that
    // stands in for the buffer (intercepted buffers are backed by USM).
    ur_usm_desc_t USMDesc{};
    USMDesc.align = getAlignment();
    ur_usm_pool_handle_t Pool{};
    URes = getTsanInterceptor()->allocateMemory(Context, Device, &USMDesc, Pool,
                                                Size, AllocType::DEVICE_USM,
                                                ur_cast<void **>(&Allocation));
    if (URes != UR_RESULT_SUCCESS) {
      getContext()->logger.error(
          "Failed to allocate {} bytes memory for buffer {}", Size, this);
      return URes;
    }

    // Seed the fresh allocation with the user's host data, if the buffer was
    // created over a host pointer.
    if (HostPtr) {
      ManagedQueue Queue(Context, Device);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to copy {} bytes data from host "
                                   "pointer {} to buffer {}",
                                   Size, HostPtr, this);
        return URes;
      }
    }
  }

  Handle = Allocation;

  // Very first access to the buffer: nothing to migrate, just record which
  // device now holds the authoritative copy.
  if (!LastSyncedDevice.hDevice) {
    LastSyncedDevice = MemBuffer::Device_t{Device, Handle};
    return URes;
  }

  // If the device required to allocate memory is not the previous one, we
  // need to do data migration.
  if (Device != LastSyncedDevice.hDevice) {
    // Migration is staged through a host USM bounce buffer, stored in
    // `Allocations` under the null-device key and reused across migrations.
    auto &HostAllocation = Allocations[nullptr];
    if (!HostAllocation) {
      ur_usm_desc_t USMDesc{};
      USMDesc.align = getAlignment();
      ur_usm_pool_handle_t Pool{};
      URes = getContext()->urDdiTable.USM.pfnHostAlloc(
          Context, &USMDesc, Pool, Size, ur_cast<void **>(&HostAllocation));
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to allocate {} bytes host "
                                   "USM for buffer {} migration",
                                   Size, this);
        return URes;
      }
    }

    // Copy data from last synced device to host
    {
      ManagedQueue Queue(Context, LastSyncedDevice.hDevice);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0,
          nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to migrate memory buffer data");
        return URes;
      }
    }

    // Sync data back to device
    {
      ManagedQueue Queue(Context, Device);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to migrate memory buffer data");
        return URes;
      }
    }
  }

  LastSyncedDevice = MemBuffer::Device_t{Device, Handle};

  return URes;
}
179+
180+
/// Release every USM allocation (per-device and the host bounce buffer) that
/// backs this buffer.
///
/// All entries are freed even if one free fails; the first failure is
/// reported as the return value. Returning early on the first error would
/// leave `Allocations` holding a mix of dangling (already freed) and leaked
/// pointers, risking a double-free on a subsequent call.
ur_result_t MemBuffer::free() {
  ur_result_t Result = UR_RESULT_SUCCESS;
  for (const auto &[_, Ptr] : Allocations) {
    ur_result_t URes = getContext()->urDdiTable.USM.pfnFree(Context, Ptr);
    if (URes != UR_RESULT_SUCCESS) {
      getContext()->logger.error("Failed to free buffer handle {}", Ptr);
      if (Result == UR_RESULT_SUCCESS) {
        Result = URes;
      }
    }
  }
  Allocations.clear();
  return Result;
}
191+
192+
size_t MemBuffer::getAlignment() {
193+
// Choose an alignment that is at most 128 and is the next power of 2
194+
// for sizes less than 128.
195+
// TODO: If we don't set the alignment size explicitly, the device will
196+
// usually choose a very large size (more than 1k). Then sanitizer will
197+
// allocate extra unnessary memory. Not sure if this will impact
198+
// performance.
199+
size_t MsbIdx = 63 - __builtin_clzl(Size);
200+
size_t Alignment = (1ULL << (MsbIdx + 1));
201+
if (Alignment > 128) {
202+
Alignment = 128;
203+
}
204+
return Alignment;
205+
}
206+
207+
} // namespace tsan
208+
} // namespace ur_sanitizer_layer
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
*
3+
* Copyright (C) 2025 Intel Corporation
4+
*
5+
* Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
* Exceptions. See LICENSE.TXT
7+
*
8+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
9+
*
10+
* @file tsan_buffer.hpp
11+
*
12+
*/
13+
14+
#pragma once
15+
16+
#include <atomic>
17+
#include <memory>
18+
#include <optional>
19+
20+
#include "ur/ur.hpp"
21+
22+
namespace ur_sanitizer_layer {
23+
namespace tsan {
24+
25+
/// Shadow representation of a ur_mem_handle_t buffer for the device thread
/// sanitizer. Intercepted buffers are backed by USM allocations (one per
/// device, created lazily by getHandle) instead of native memory objects.
struct MemBuffer {
  // Buffer constructor
  MemBuffer(ur_context_handle_t Context, size_t Size, char *HostPtr)
      : Context(Context), Size(Size), HostPtr(HostPtr) {}

  // Sub-buffer constructor
  MemBuffer(std::shared_ptr<MemBuffer> Parent, size_t Origin, size_t Size)
      : Context(Parent->Context), Size(Size),
        SubBuffer{{std::move(Parent), Origin}} {}

  // Returns (via Handle) the USM allocation backing this buffer on Device,
  // allocating and/or migrating data as needed. Device may be null.
  ur_result_t getHandle(ur_device_handle_t Device, char *&Handle);

  // Frees all USM allocations held in `Allocations`.
  ur_result_t free();

  // Alignment used when creating the backing USM allocations.
  size_t getAlignment();

  // Per-device backing allocations; the null-device key holds the host USM
  // staging allocation used for cross-device migration.
  std::unordered_map<ur_device_handle_t, char *> Allocations;

  enum AccessMode { UNKNOWN, READ_WRITE, READ_ONLY, WRITE_ONLY };

  // A live host mapping created by urEnqueueMemBufferMap: the mapped region
  // within the buffer, keyed by the pointer handed back to the user.
  struct Mapping {
    size_t Offset;
    size_t Size;
  };

  // Outstanding map operations, keyed by the mapped host pointer.
  std::unordered_map<void *, Mapping> Mappings;

  ur_context_handle_t Context;

  // Device (and its allocation) holding the most recently synced copy of the
  // buffer contents; migration source for accesses on other devices.
  struct Device_t {
    ur_device_handle_t hDevice;
    char *MemHandle;
  };
  Device_t LastSyncedDevice{};

  // Buffer size in bytes.
  size_t Size;

  // User-provided host pointer the buffer was created over, or null. Used to
  // seed the first device allocation and for write-back.
  char *HostPtr{};

  // Set only for sub-buffers: parent buffer plus byte offset into it.
  struct SubBuffer_t {
    std::shared_ptr<MemBuffer> Parent;
    size_t Origin;
  };

  std::optional<SubBuffer_t> SubBuffer;

  std::atomic<int32_t> RefCount = 1;

  // Guards Allocations / LastSyncedDevice (see getHandle).
  ur_shared_mutex Mutex;
};
75+
76+
ur_result_t EnqueueMemCopyRectHelper(
77+
ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
78+
ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
79+
size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
80+
bool Blocking, uint32_t NumEventsInWaitList,
81+
const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event);
82+
83+
} // namespace tsan
84+
} // namespace ur_sanitizer_layer

0 commit comments

Comments
 (0)