[DevTSAN] Support device thread sanitizer for sycl::buffer #17625

Merged 7 commits on Mar 27, 2025
33 changes: 33 additions & 0 deletions sycl/test-e2e/ThreadSanitizer/check_buffer.cpp
@@ -0,0 +1,33 @@
// REQUIRES: linux, cpu || (gpu && level_zero)
// ALLOW_RETRIES: 10
// RUN: %{build} %device_tsan_flags -O0 -g -o %t1.out
// RUN: %{run} %t1.out 2>&1 | FileCheck %s
// RUN: %{build} %device_tsan_flags -O2 -g -o %t2.out
// RUN: %{run} %t2.out 2>&1 | FileCheck %s

#include <sycl/detail/core.hpp>

static const int N = 16;

int main() {
  sycl::queue q;

  std::vector<int> v(N);

  {
    // We intentionally construct the sycl::buffer from a host pointer and
    // trigger the data write-back here: in Unified Runtime, sycl::buffer is
    // intercepted and backed by USM, so this pattern needs coverage.
    sycl::buffer<int, 1> buf(v.data(), v.size());
    q.submit([&](sycl::handler &h) {
       auto A = buf.get_access<sycl::access::mode::read_write>(h);
       h.parallel_for<class Test>(sycl::nd_range<1>(N, 1),
                                  [=](sycl::nd_item<1>) { A[0]++; });
     }).wait();
    // CHECK: WARNING: DeviceSanitizer: data race
    // CHECK-NEXT: When write of size 4 at 0x{{.*}} in kernel <{{.*}}Test>
    // CHECK-NEXT: #0 {{.*}}check_buffer.cpp:[[@LINE-4]]
  }

  return 0;
}
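The flagged race is deliberate: every work-item in the nd_range increments A[0]. For contrast, a race-free variant of the kernel (illustrative only, not part of the test) would give each work-item its own element, so DeviceSanitizer has nothing to report:

    // Illustrative only: one element per work-item, no conflicting writes.
    q.submit([&](sycl::handler &h) {
       auto A = buf.get_access<sycl::access::mode::read_write>(h);
       h.parallel_for<class NoRace>(
           sycl::nd_range<1>(N, 1),
           [=](sycl::nd_item<1> It) { A[It.get_global_id(0)]++; });
     }).wait();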
33 changes: 33 additions & 0 deletions sycl/test-e2e/ThreadSanitizer/check_sub_buffer.cpp
@@ -0,0 +1,33 @@
// REQUIRES: linux, cpu || (gpu && level_zero)
// ALLOW_RETRIES: 10
// RUN: %{build} %device_tsan_flags -O0 -g -o %t1.out
// RUN: %{run} %t1.out 2>&1 | FileCheck %s
// RUN: %{build} %device_tsan_flags -O2 -g -o %t2.out
// RUN: %{run} %t2.out 2>&1 | FileCheck %s

#include <sycl/detail/core.hpp>

int main() {
  constexpr size_t size_x = 16;

  std::vector<int> v(size_x);
  for (size_t i = 0; i < size_x; i++)
    v[i] = i;

  {
    sycl::queue q;
    sycl::buffer<int> buf(v.data(), v.size());
    sycl::buffer<int> sub_buf(buf, {size_x / 2}, {size_x / 2});

    q.submit([&](sycl::handler &cgh) {
       auto accessor = sub_buf.get_access<sycl::access::mode::read_write>(cgh);
       cgh.parallel_for<class Test>(sycl::nd_range<1>(size_x / 2, 1),
                                    [=](sycl::nd_item<1>) { accessor[0]++; });
     }).wait();
    // CHECK: WARNING: DeviceSanitizer: data race
    // CHECK-NEXT: When write of size 4 at 0x{{.*}} in kernel <{{.*}}Test>
    // CHECK-NEXT: #0 {{.*}}check_sub_buffer.cpp:[[@LINE-4]]
  }

  return 0;
}
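The sub-buffer covers the second half of the parent, so accessor[0] aliases v[size_x / 2]. In the UR layer below, MemBuffer::getHandle() resolves a sub-buffer by adding its byte origin to the parent's allocation; a minimal standalone illustration of that arithmetic, assuming a 4-byte int:

    #include <cassert>
    #include <cstddef>

    int main() {
      constexpr size_t size_x = 16;
      char parent[size_x * sizeof(int)];                // stands in for the parent USM allocation
      size_t origin_bytes = (size_x / 2) * sizeof(int); // 8 elements * 4 bytes = 32
      char *sub_handle = parent + origin_bytes;         // sub-buffer handle: accessor[0] lands here
      assert(sub_handle - parent == 32);
      return 0;
    }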
2 changes: 2 additions & 0 deletions unified-runtime/source/loader/CMakeLists.txt
@@ -163,6 +163,8 @@ if(UR_ENABLE_SANITIZER)
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_report.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.cpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_buffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_buffer.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_ddi.cpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_ddi.hpp
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/tsan/tsan_interceptor.cpp
208 changes: 208 additions & 0 deletions unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp
@@ -0,0 +1,208 @@
/*
*
* Copyright (C) 2025 Intel Corporation
*
* Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
* Exceptions. See LICENSE.TXT
*
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file tsan_buffer.cpp
*
*/

#include "tsan_buffer.hpp"
#include "sanitizer_common/sanitizer_utils.hpp"
#include "tsan_interceptor.hpp"
#include "ur_sanitizer_layer.hpp"

namespace ur_sanitizer_layer {
namespace tsan {

ur_result_t EnqueueMemCopyRectHelper(
    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
    bool Blocking, uint32_t NumEventsInWaitList,
    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) {
  // If the user doesn't specify the src/dst row and slice pitches, derive
  // them from the copy region.
  if (SrcRowPitch == 0) {
    SrcRowPitch = Region.width;
  }

  if (SrcSlicePitch == 0) {
    SrcSlicePitch = SrcRowPitch * Region.height;
  }

  if (DstRowPitch == 0) {
    DstRowPitch = Region.width;
  }

  if (DstSlicePitch == 0) {
    DstSlicePitch = DstRowPitch * Region.height;
  }

  // Compute the actual src and dst start addresses of the copy.
  char *SrcOrigin = pSrc + SrcOffset.x + SrcRowPitch * SrcOffset.y +
                    SrcSlicePitch * SrcOffset.z;
  char *DstOrigin = pDst + DstOffset.x + DstRowPitch * DstOffset.y +
                    DstSlicePitch * DstOffset.z;

  ur_device_handle_t Device = GetDevice(Queue);
  std::shared_ptr<DeviceInfo> DeviceInfo =
      getTsanInterceptor()->getDeviceInfo(Device);
  std::vector<ur_event_handle_t> Events;

  // USM doesn't currently support 3D memory copies, so implement one as a
  // loop of 2D copies, one per slice.
  for (size_t i = 0; i < Region.depth; i++) {
    ur_event_handle_t NewEvent{};
    UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D(
        Queue, false, DstOrigin + (i * DstSlicePitch), DstRowPitch,
        SrcOrigin + (i * SrcSlicePitch), SrcRowPitch, Region.width,
        Region.height, NumEventsInWaitList, EventWaitList, &NewEvent));
    Events.push_back(NewEvent);
  }

  if (Blocking) {
    UR_CALL(getContext()->urDdiTable.Event.pfnWait(Events.size(), &Events[0]));
  }

  if (Event) {
    UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait(Queue, Events.size(),
                                                           &Events[0], Event));
  }

  return UR_RESULT_SUCCESS;
}
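To make the origin arithmetic concrete, here is a small standalone check with illustrative values (not taken from the PR): a region starting at offset (2, 1, 0) in a source with a 16-byte row pitch and a 128-byte slice pitch (16 bytes * 8 rows).

    #include <cassert>
    #include <cstddef>

    int main() {
      // Hypothetical layout: 16-byte row pitch, 8 rows per slice.
      size_t RowPitch = 16, SlicePitch = 16 * 8;
      size_t OffX = 2, OffY = 1, OffZ = 0;
      // Same formula as SrcOrigin above: x + RowPitch * y + SlicePitch * z.
      size_t Origin = OffX + RowPitch * OffY + SlicePitch * OffZ;
      assert(Origin == 18); // 2 + 16 * 1 + 128 * 0
      return 0;
    }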

ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
  // Sub-buffers don't maintain their own allocations; they rely on the
  // parent buffer.
  if (SubBuffer) {
    UR_CALL(SubBuffer->Parent->getHandle(Device, Handle));
    Handle += SubBuffer->Origin;
    return UR_RESULT_SUCCESS;
  }

  // Device may be null; following the Level Zero adapter's practice, fall
  // back to the first device in the context.
  if (!Device) {
    auto Devices = GetDevices(Context);
    assert(Devices.size() > 0 && "Devices should not be empty");
    Device = Devices[0];
  }
  assert((void *)Device != nullptr && "Device cannot be nullptr");

  std::scoped_lock<ur_shared_mutex> Guard(Mutex);
  auto &Allocation = Allocations[Device];
  ur_result_t URes = UR_RESULT_SUCCESS;
  if (!Allocation) {
    ur_usm_desc_t USMDesc{};
    USMDesc.align = getAlignment();
    ur_usm_pool_handle_t Pool{};
    URes = getTsanInterceptor()->allocateMemory(Context, Device, &USMDesc, Pool,
                                                Size, AllocType::DEVICE_USM,
                                                ur_cast<void **>(&Allocation));
    if (URes != UR_RESULT_SUCCESS) {
      getContext()->logger.error(
          "Failed to allocate {} bytes memory for buffer {}", Size, this);
      return URes;
    }

    if (HostPtr) {
      ManagedQueue Queue(Context, Device);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to copy {} bytes data from host "
                                   "pointer {} to buffer {}",
                                   Size, HostPtr, this);
        return URes;
      }
    }
  }

  Handle = Allocation;

  if (!LastSyncedDevice.hDevice) {
    LastSyncedDevice = MemBuffer::Device_t{Device, Handle};
    return URes;
  }

  // If the requesting device differs from the last synced one, migrate the
  // buffer contents through a host staging allocation.
  if (Device != LastSyncedDevice.hDevice) {
    auto &HostAllocation = Allocations[nullptr];
    if (!HostAllocation) {
      ur_usm_desc_t USMDesc{};
      USMDesc.align = getAlignment();
      ur_usm_pool_handle_t Pool{};
      URes = getContext()->urDdiTable.USM.pfnHostAlloc(
          Context, &USMDesc, Pool, Size, ur_cast<void **>(&HostAllocation));
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to allocate {} bytes host "
                                   "USM for buffer {} migration",
                                   Size, this);
        return URes;
      }
    }

    // Copy data from the last synced device to the host.
    {
      ManagedQueue Queue(Context, LastSyncedDevice.hDevice);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0,
          nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to migrate memory buffer data");
        return URes;
      }
    }

    // Sync the data back to the requesting device.
    {
      ManagedQueue Queue(Context, Device);
      URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
          Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr);
      if (URes != UR_RESULT_SUCCESS) {
        getContext()->logger.error("Failed to migrate memory buffer data");
        return URes;
      }
    }
  }

  LastSyncedDevice = MemBuffer::Device_t{Device, Handle};

  return URes;
}
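A hedged sketch of the caller side (identifier names are hypothetical; the real call sites live in tsan_interceptor.cpp and tsan_ddi.cpp): whenever a kernel argument refers to this buffer, the interceptor resolves the per-device handle, and getHandle() allocates, copies from HostPtr, or migrates as needed.

    // Hypothetical caller-side sketch: resolve a MemBuffer to a device
    // pointer before setting it as a kernel argument. Buffer, Device,
    // Kernel and ArgIndex are assumed to be in scope.
    char *ArgPtr = nullptr;
    UR_CALL(Buffer->getHandle(Device, ArgPtr)); // alloc / host-copy / migrate
    UR_CALL(getContext()->urDdiTable.Kernel.pfnSetArgPointer(Kernel, ArgIndex,
                                                             nullptr, ArgPtr));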

ur_result_t MemBuffer::free() {
  for (const auto &[_, Ptr] : Allocations) {
    ur_result_t URes = getContext()->urDdiTable.USM.pfnFree(Context, Ptr);
    if (URes != UR_RESULT_SUCCESS) {
      getContext()->logger.error("Failed to free buffer handle {}", Ptr);
      return URes;
    }
  }
  Allocations.clear();
  return UR_RESULT_SUCCESS;
}

size_t MemBuffer::getAlignment() {
  // Round up to the power of two above Size's most significant bit, capped
  // at 128.
  // TODO: If we don't set the alignment explicitly, the device usually
  // chooses a very large one (more than 1k), and the sanitizer then
  // allocates extra unnecessary memory. Not sure if this impacts
  // performance.
  size_t MsbIdx = 63 - __builtin_clzl(Size);
  size_t Alignment = (1ULL << (MsbIdx + 1));
  if (Alignment > 128) {
    Alignment = 128;
  }
  return Alignment;
}
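Worked examples of this rounding, assuming a 64-bit size_t (a standalone mirror of the function above, for illustration only):

    #include <cassert>
    #include <cstddef>

    // Standalone mirror of MemBuffer::getAlignment() for illustration.
    static size_t AlignmentFor(size_t Size) {
      size_t MsbIdx = 63 - __builtin_clzl(Size);
      size_t Alignment = 1ULL << (MsbIdx + 1);
      return Alignment > 128 ? 128 : Alignment;
    }

    int main() {
      assert(AlignmentFor(16) == 32);    // MSB at bit 4 -> 1 << 5; exact powers of two are doubled
      assert(AlignmentFor(100) == 128);  // MSB at bit 6 -> 1 << 7
      assert(AlignmentFor(4096) == 128); // anything larger is capped at 128
      return 0;
    }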

} // namespace tsan
} // namespace ur_sanitizer_layer
84 changes: 84 additions & 0 deletions unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.hpp
@@ -0,0 +1,84 @@
/*
*
* Copyright (C) 2025 Intel Corporation
*
* Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
* Exceptions. See LICENSE.TXT
*
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file tsan_buffer.hpp
*
*/

#pragma once

#include <atomic>
#include <memory>
#include <optional>
#include <unordered_map>

#include "ur/ur.hpp"

namespace ur_sanitizer_layer {
namespace tsan {

struct MemBuffer {
  // Buffer constructor
  MemBuffer(ur_context_handle_t Context, size_t Size, char *HostPtr)
      : Context(Context), Size(Size), HostPtr(HostPtr) {}

  // Sub-buffer constructor
  MemBuffer(std::shared_ptr<MemBuffer> Parent, size_t Origin, size_t Size)
      : Context(Parent->Context), Size(Size),
        SubBuffer{{std::move(Parent), Origin}} {}

  ur_result_t getHandle(ur_device_handle_t Device, char *&Handle);

  ur_result_t free();

  size_t getAlignment();

  std::unordered_map<ur_device_handle_t, char *> Allocations;

  enum AccessMode { UNKNOWN, READ_WRITE, READ_ONLY, WRITE_ONLY };

  struct Mapping {
    size_t Offset;
    size_t Size;
  };

  std::unordered_map<void *, Mapping> Mappings;

  ur_context_handle_t Context;

  struct Device_t {
    ur_device_handle_t hDevice;
    char *MemHandle;
  };
  Device_t LastSyncedDevice{};

  size_t Size;

  char *HostPtr{};

  struct SubBuffer_t {
    std::shared_ptr<MemBuffer> Parent;
    size_t Origin;
  };

  std::optional<SubBuffer_t> SubBuffer;

  std::atomic<int32_t> RefCount = 1;

  ur_shared_mutex Mutex;
};

ur_result_t EnqueueMemCopyRectHelper(
    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
    bool Blocking, uint32_t NumEventsInWaitList,
    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event);

} // namespace tsan
} // namespace ur_sanitizer_layer
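Tying the header back to the e2e tests above, a minimal sketch (hypothetical wiring; the real interception happens in tsan_ddi.cpp) of how check_sub_buffer.cpp's buffers map onto MemBuffer, with Context and the host vector v assumed in scope:

    // A 16-int parent buffer wrapping the host vector, plus a sub-buffer
    // covering its second half: byte origin 8 * sizeof(int) = 32, byte size 32.
    auto Parent = std::make_shared<MemBuffer>(
        Context, 16 * sizeof(int), reinterpret_cast<char *>(v.data()));
    auto Sub = std::make_shared<MemBuffer>(Parent, /*Origin=*/32, /*Size=*/32);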