[DevTSAN] Support device thread sanitizer for sycl::buffer #17625

Merged 7 commits on Mar 27, 2025
33 changes: 33 additions & 0 deletions sycl/test-e2e/ThreadSanitizer/check_buffer.cpp
@@ -0,0 +1,33 @@
// REQUIRES: linux, cpu || (gpu && level_zero)
// ALLOW_RETRIES: 10
// RUN: %{build} %device_tsan_flags -O0 -g -o %t1.out
// RUN: %{run} %t1.out 2>&1 | FileCheck %s
// RUN: %{build} %device_tsan_flags -O2 -g -o %t2.out
// RUN: %{run} %t2.out 2>&1 | FileCheck %s

#include <sycl/detail/core.hpp>

static const int N = 16;

int main() {
  sycl::queue q;

  std::vector<int> v(N);

  {
    // We intentionally construct the sycl::buffer from a host pointer and
    // trigger the data write-back here: in Unified Runtime, sycl::buffer is
    // intercepted and backed by USM, so this pattern needs coverage.
    sycl::buffer<int, 1> buf(v.data(), v.size());
    q.submit([&](sycl::handler &h) {
       auto A = buf.get_access<sycl::access::mode::read_write>(h);
       h.parallel_for<class Test>(sycl::nd_range<1>(N, 1),
                                  [=](sycl::nd_item<1>) { A[0]++; });
     }).wait();
    // CHECK: WARNING: DeviceSanitizer: data race
    // CHECK-NEXT: When write of size 4 at 0x{{.*}} in kernel <{{.*}}Test>
    // CHECK-NEXT: #0 {{.*}}check_buffer.cpp:[[@LINE-4]]
  }

  return 0;
}
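The flagged race is deliberate: every work-item in the nd_range increments A[0]. For contrast, a race-free variant of the kernel (illustrative only, not part of the test) would give each work-item its own element, so DeviceSanitizer has nothing to report:

    // Illustrative only: one element per work-item, no conflicting writes.
    q.submit([&](sycl::handler &h) {
       auto A = buf.get_access<sycl::access::mode::read_write>(h);
       h.parallel_for<class NoRace>(
           sycl::nd_range<1>(N, 1),
           [=](sycl::nd_item<1> It) { A[It.get_global_id(0)]++; });
     }).wait();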
33 changes: 33 additions & 0 deletions sycl/test-e2e/ThreadSanitizer/check_sub_buffer.cpp
@@ -0,0 +1,33 @@
// REQUIRES: linux, cpu || (gpu && level_zero)
// ALLOW_RETRIES: 10
// RUN: %{build} %device_tsan_flags -O0 -g -o %t1.out
// RUN: %{run} %t1.out 2>&1 | FileCheck %s
// RUN: %{build} %device_tsan_flags -O2 -g -o %t2.out
// RUN: %{run} %t2.out 2>&1 | FileCheck %s

#include <sycl/detail/core.hpp>

int main() {
  constexpr size_t size_x = 16;

  std::vector<int> v(size_x);
  for (size_t i = 0; i < size_x; i++)
    v[i] = i;

  {
    sycl::queue q;
    sycl::buffer<int> buf(v.data(), v.size());
    sycl::buffer<int> sub_buf(buf, {size_x / 2}, {size_x / 2});

    q.submit([&](sycl::handler &cgh) {
       auto accessor = sub_buf.get_access<sycl::access::mode::read_write>(cgh);
       cgh.parallel_for<class Test>(sycl::nd_range<1>(size_x / 2, 1),
                                    [=](sycl::nd_item<1>) { accessor[0]++; });
     }).wait();
    // CHECK: WARNING: DeviceSanitizer: data race
    // CHECK-NEXT: When write of size 4 at 0x{{.*}} in kernel <{{.*}}Test>
    // CHECK-NEXT: #0 {{.*}}check_sub_buffer.cpp:[[@LINE-4]]
  }

  return 0;
}
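The sub-buffer covers the second half of the parent, so accessor[0] aliases v[size_x / 2]. In the UR layer below, MemBuffer::getHandle() resolves a sub-buffer by adding its byte origin to the parent's allocation; a minimal standalone illustration of that arithmetic, assuming a 4-byte int:

    #include <cassert>
    #include <cstddef>

    int main() {
      constexpr size_t size_x = 16;
      char parent[size_x * sizeof(int)];                // stands in for the parent USM allocation
      size_t origin_bytes = (size_x / 2) * sizeof(int); // 8 elements * 4 bytes = 32
      char *sub_handle = parent + origin_bytes;         // sub-buffer handle: accessor[0] lands here
      assert(sub_handle - parent == 32);
      return 0;
    }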
2 changes: 2 additions & 0 deletions unified-runtime/source/loader/CMakeLists.txt
@@ -163,6 +163,8 @@ if(UR_ENABLE_SANITIZER)
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_report.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.cpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_buffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_buffer.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_ddi.cpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_ddi.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_interceptor.cpp
208 changes: 208 additions & 0 deletions unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp
@@ -0,0 +1,208 @@
/*
*
* Copyright (C) 2025 Intel Corporation
*
* Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
* Exceptions. See LICENSE.TXT
*
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file tsan_buffer.cpp
*
*/

#include "tsan_buffer.hpp"
#include "sanitizer_common/sanitizer_utils.hpp"
#include "tsan_interceptor.hpp"
#include "ur_sanitizer_layer.hpp"

namespace ur_sanitizer_layer {
namespace tsan {

ur_result_t EnqueueMemCopyRectHelper(
    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
    bool Blocking, uint32_t NumEventsInWaitList,
    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) {
  // If the user doesn't specify the src/dst row and slice pitches, derive
  // them from the copy region.
  if (SrcRowPitch == 0) {
    SrcRowPitch = Region.width;
  }

  if (SrcSlicePitch == 0) {
    SrcSlicePitch = SrcRowPitch * Region.height;
  }

  if (DstRowPitch == 0) {
    DstRowPitch = Region.width;
  }

  if (DstSlicePitch == 0) {
    DstSlicePitch = DstRowPitch * Region.height;
  }

  // Compute the actual src and dst start addresses of the copy.
  char *SrcOrigin = pSrc + SrcOffset.x + SrcRowPitch * SrcOffset.y +
                    SrcSlicePitch * SrcOffset.z;
  char *DstOrigin = pDst + DstOffset.x + DstRowPitch * DstOffset.y +
                    DstSlicePitch * DstOffset.z;

  ur_device_handle_t Device = GetDevice(Queue);
  std::shared_ptr<DeviceInfo> DeviceInfo =
      getTsanInterceptor()->getDeviceInfo(Device);
  std::vector<ur_event_handle_t> Events;

  // USM doesn't currently support 3D memory copies, so implement one as a
  // loop of 2D copies, one per slice.
  for (size_t i = 0; i < Region.depth; i++) {
    ur_event_handle_t NewEvent{};
    UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D(
        Queue, false, DstOrigin + (i * DstSlicePitch), DstRowPitch,
        SrcOrigin + (i * SrcSlicePitch), SrcRowPitch, Region.width,
        Region.height, NumEventsInWaitList, EventWaitList, &NewEvent));
    Events.push_back(NewEvent);
  }

  if (Blocking) {
    UR_CALL(getContext()->urDdiTable.Event.pfnWait(Events.size(), &Events[0]));
  }

  if (Event) {
    UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait(Queue, Events.size(),
                                                           &Events[0], Event));
  }

  return UR_RESULT_SUCCESS;
}
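To make the origin arithmetic concrete, here is a small standalone check with illustrative values (not taken from the PR): a region starting at offset (2, 1, 0) in a source with a 16-byte row pitch and a 128-byte slice pitch (16 bytes * 8 rows).

    #include <cassert>
    #include <cstddef>

    int main() {
      // Hypothetical layout: 16-byte row pitch, 8 rows per slice.
      size_t RowPitch = 16, SlicePitch = 16 * 8;
      size_t OffX = 2, OffY = 1, OffZ = 0;
      // Same formula as SrcOrigin above: x + RowPitch * y + SlicePitch * z.
      size_t Origin = OffX + RowPitch * OffY + SlicePitch * OffZ;
      assert(Origin == 18); // 2 + 16 * 1 + 128 * 0
      return 0;
    }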

ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
  // Sub-buffers don't maintain their own allocations; they rely on the
  // parent buffer.
  if (SubBuffer) {
    UR_CALL(SubBuffer->Parent->getHandle(Device, Handle));
    Handle += SubBuffer->Origin;
    return UR_RESULT_SUCCESS;
  }

  // Device may be null; following the Level Zero adapter's practice, fall
  // back to the first device in the context.
  if (!Device) {
    auto Devices = GetDevices(Context);
    assert(Devices.size() > 0 && "Devices should not be empty");
    Device = Devices[0];
  }
  assert((void *)Device != nullptr && "Device cannot be nullptr");

  std::scoped_lock<ur_shared_mutex> Guard(Mutex);
  auto &Allocation = Allocations[Device];
  ur_result_t URes = UR_RESULT_SUCCESS;
  if (!Allocation) {
    ur_usm_desc_t USMDesc{};
    USMDesc.align = getAlignment();
    ur_usm_pool_handle_t Pool{};
    URes = getTsanInterceptor()->allocateMemory(Context, Device, &USMDesc, Pool,
                                                Size, AllocType::DEVICE_USM,
                                                ur_cast<void **>(&Allocation));
    if (URes != UR_RESULT_SUCCESS) {
      getContext()->logger.error(
          "Failed to allocate {} bytes memory for buffer {}", Size, this);
      return URes;
    }

    if (HostPtr) {
      ManagedQueue Queue(Context, Device);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to copy {} bytes data from host "
                                   "pointer {} to buffer {}",
                                   Size, HostPtr, this);
        return URes;
      }
    }
  }

  Handle = Allocation;

  if (!LastSyncedDevice.hDevice) {
    LastSyncedDevice = MemBuffer::Device_t{Device, Handle};
    return URes;
  }

  // If the requesting device differs from the last synced one, migrate the
  // buffer contents through a host staging allocation.
  if (Device != LastSyncedDevice.hDevice) {
    auto &HostAllocation = Allocations[nullptr];
    if (!HostAllocation) {
      ur_usm_desc_t USMDesc{};
      USMDesc.align = getAlignment();
      ur_usm_pool_handle_t Pool{};
      URes = getContext()->urDdiTable.USM.pfnHostAlloc(
          Context, &USMDesc, Pool, Size, ur_cast<void **>(&HostAllocation));
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to allocate {} bytes host "
                                   "USM for buffer {} migration",
                                   Size, this);
        return URes;
      }
    }

    // Copy data from the last synced device to the host.
    {
      ManagedQueue Queue(Context, LastSyncedDevice.hDevice);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0,
          nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to migrate memory buffer data");
        return URes;
      }
    }

    // Sync the data back to the requesting device.
    {
      ManagedQueue Queue(Context, Device);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to migrate memory buffer data");
        return URes;
      }
    }
  }

  LastSyncedDevice = MemBuffer::Device_t{Device, Handle};

  return URes;
}
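A hedged sketch of the caller side (identifier names are hypothetical; the real call sites live in tsan_interceptor.cpp and tsan_ddi.cpp): whenever a kernel argument refers to this buffer, the interceptor resolves the per-device handle, and getHandle() allocates, copies from HostPtr, or migrates as needed.

    // Hypothetical caller-side sketch: resolve a MemBuffer to a device
    // pointer before setting it as a kernel argument. Buffer, Device,
    // Kernel and ArgIndex are assumed to be in scope.
    char *ArgPtr = nullptr;
    UR_CALL(Buffer->getHandle(Device, ArgPtr)); // alloc / host-copy / migrate
    UR_CALL(getContext()->urDdiTable.Kernel.pfnSetArgPointer(Kernel, ArgIndex,
                                                             nullptr, ArgPtr));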

ur_result_t MemBuffer::free() {
  for (const auto &[_, Ptr] : Allocations) {
    ur_result_t URes = getContext()->urDdiTable.USM.pfnFree(Context, Ptr);
    if (URes != UR_RESULT_SUCCESS) {
      getContext()->logger.error("Failed to free buffer handle {}", Ptr);
      return URes;
    }
  }
  Allocations.clear();
  return UR_RESULT_SUCCESS;
}

size_t MemBuffer::getAlignment() {
  // Round up to the power of two above Size's most significant bit, capped
  // at 128.
  // TODO: If we don't set the alignment explicitly, the device usually
  // chooses a very large one (more than 1k), and the sanitizer then
  // allocates extra unnecessary memory. Not sure if this impacts
  // performance.
  size_t MsbIdx = 63 - __builtin_clzl(Size);
  size_t Alignment = (1ULL << (MsbIdx + 1));
  if (Alignment > 128) {
    Alignment = 128;
  }
  return Alignment;
}
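Worked examples of this rounding, assuming a 64-bit size_t (a standalone mirror of the function above, for illustration only):

    #include <cassert>
    #include <cstddef>

    // Standalone mirror of MemBuffer::getAlignment() for illustration.
    static size_t AlignmentFor(size_t Size) {
      size_t MsbIdx = 63 - __builtin_clzl(Size);
      size_t Alignment = 1ULL << (MsbIdx + 1);
      return Alignment > 128 ? 128 : Alignment;
    }

    int main() {
      assert(AlignmentFor(16) == 32);    // MSB at bit 4 -> 1 << 5; exact powers of two are doubled
      assert(AlignmentFor(100) == 128);  // MSB at bit 6 -> 1 << 7
      assert(AlignmentFor(4096) == 128); // anything larger is capped at 128
      return 0;
    }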

} // namespace tsan
} // namespace ur_sanitizer_layer
84 changes: 84 additions & 0 deletions unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.hpp
@@ -0,0 +1,84 @@
/*
*
* Copyright (C) 2025 Intel Corporation
*
* Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
* Exceptions. See LICENSE.TXT
*
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file tsan_buffer.hpp
*
*/

#pragma once

#include <atomic>
#include <memory>
#include <optional>
#include <unordered_map>

#include "ur/ur.hpp"

namespace ur_sanitizer_layer {
namespace tsan {

struct MemBuffer {
  // Buffer constructor
  MemBuffer(ur_context_handle_t Context, size_t Size, char *HostPtr)
      : Context(Context), Size(Size), HostPtr(HostPtr) {}

  // Sub-buffer constructor
  MemBuffer(std::shared_ptr<MemBuffer> Parent, size_t Origin, size_t Size)
      : Context(Parent->Context), Size(Size),
        SubBuffer{{std::move(Parent), Origin}} {}

  ur_result_t getHandle(ur_device_handle_t Device, char *&Handle);

  ur_result_t free();

  size_t getAlignment();

  std::unordered_map<ur_device_handle_t, char *> Allocations;

  enum AccessMode { UNKNOWN, READ_WRITE, READ_ONLY, WRITE_ONLY };

  struct Mapping {
    size_t Offset;
    size_t Size;
  };

  std::unordered_map<void *, Mapping> Mappings;

  ur_context_handle_t Context;

  struct Device_t {
    ur_device_handle_t hDevice;
    char *MemHandle;
  };
  Device_t LastSyncedDevice{};

  size_t Size;

  char *HostPtr{};

  struct SubBuffer_t {
    std::shared_ptr<MemBuffer> Parent;
    size_t Origin;
  };

  std::optional<SubBuffer_t> SubBuffer;

  std::atomic<int32_t> RefCount = 1;

  ur_shared_mutex Mutex;
};

ur_result_t EnqueueMemCopyRectHelper(
    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
    bool Blocking, uint32_t NumEventsInWaitList,
    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event);

} // namespace tsan
} // namespace ur_sanitizer_layer
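Tying the header back to the e2e tests above, a minimal sketch (hypothetical wiring; the real interception happens in tsan_ddi.cpp) of how check_sub_buffer.cpp's buffers map onto MemBuffer, with Context and the host vector v assumed in scope:

    // A 16-int parent buffer wrapping the host vector, plus a sub-buffer
    // covering its second half: byte origin 8 * sizeof(int) = 32, byte size 32.
    auto Parent = std::make_shared<MemBuffer>(
        Context, 16 * sizeof(int), reinterpret_cast<char *>(v.data()));
    auto Sub = std::make_shared<MemBuffer>(Parent, /*Origin=*/32, /*Size=*/32);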