Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,13 @@ This release contains ...
Notable changes include:

* New features / API changes:
* Added `RAJA::atomicOperation` to enable user-defined atomic update
operations implemented using a compare-and-swap loop.

* Build changes/improvements:

* Bug fixes/improvements:


Version 2025.12.2 -- Release date 2026-03-04
============================================

Expand Down
14 changes: 13 additions & 1 deletion include/RAJA/pattern/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,14 +280,26 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T* acc, T value)
* @param compare Value to compare with *acc
* @return Returns value at *acc immediately before this operation completed
*/

RAJA_SUPPRESS_HD_WARN
template<typename Policy, typename T>
RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T* acc, T compare, T value)
{
  // Select the backend overload by passing a default-constructed policy tag.
  using policy_tag = Policy;
  return RAJA::atomicCAS(policy_tag {}, acc, compare, value);
}

/*!
 * @brief Atomically apply a user-supplied update to the value at *acc
 *
 * Forwards to the RAJA::atomicOperation overload selected by a
 * default-constructed Policy object. Backends that retry the update
 * (e.g. a compare-and-swap loop) may invoke @p operation more than once.
 *
 * @param acc Pointer to the value to update
 * @param operation Callable that computes the new value from the old value
 * @return Returns value at *acc immediately before this operation completed
 */
RAJA_SUPPRESS_HD_WARN
template<typename Policy, typename T, typename Operation>
RAJA_INLINE RAJA_HOST_DEVICE T atomicOperation(T* acc, Operation&& operation)
{
  using policy_tag = Policy;
  return RAJA::atomicOperation(policy_tag {},
                               acc,
                               std::forward<Operation>(operation));
}

/*!
* \brief Atomic wrapper object
*
Expand Down
6 changes: 6 additions & 0 deletions include/RAJA/policy/atomic_auto.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,12 @@ atomicCAS(auto_atomic, T* acc, T compare, T value)
return atomicCAS(RAJA_AUTO_ATOMIC, acc, compare, value);
}

template<typename T, typename Operation>
RAJA_INLINE RAJA_HOST_DEVICE T
atomicOperation(auto_atomic, T* acc, Operation&& operation)
{
return atomicOperation(RAJA_AUTO_ATOMIC, acc, std::forward<Operation>(operation));
}

} // namespace RAJA

Expand Down
8 changes: 8 additions & 0 deletions include/RAJA/policy/atomic_builtin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "RAJA/config.hpp"

#include <cstdint>
#include <utility>

#if defined(RAJA_COMPILER_MSVC) || \
((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
Expand Down Expand Up @@ -1011,6 +1012,13 @@ atomicCAS(builtin_atomic, T* acc, T compare, T value)
return detail::builtin_atomicCAS(acc, compare, value);
}

/*!
 * @brief Generic atomic operation for the builtin_atomic policy
 *
 * Hands the update off to detail::builtin_atomicCAS_loop.
 *
 * @param acc Pointer to the value to update
 * @param operation Callable that computes the new value from the old value
 * @return The result of detail::builtin_atomicCAS_loop
 */
template<typename T, typename Operation>
RAJA_DEVICE_HIP RAJA_INLINE T
atomicOperation(builtin_atomic, T* acc, Operation&& operation)
{
  return detail::builtin_atomicCAS_loop(
      acc,
      std::forward<Operation>(operation));
}


} // namespace RAJA

Expand Down
12 changes: 12 additions & 0 deletions include/RAJA/policy/cuda/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,18 @@ atomicCAS(cuda_atomic_explicit<host_policy>, T* acc, T compare, T value)
#endif
}

/*!
 * @brief Generic atomic operation for cuda_atomic_explicit policies
 *
 * When compiled for the device (__CUDA_ARCH__ defined) the update is performed
 * by detail::cuda_atomicCAS_loop; otherwise it is delegated to the
 * atomicOperation overload for host_policy.
 *
 * @param acc Pointer to the value to update
 * @param operation Callable that computes the new value from the old value
 * @return The result of the selected implementation
 */
RAJA_SUPPRESS_HD_WARN
template<typename T, typename Operation, typename host_policy>
RAJA_INLINE RAJA_HOST_DEVICE T
atomicOperation(cuda_atomic_explicit<host_policy>, T* acc, Operation&& operation)
{
#if !defined(__CUDA_ARCH__)
  // Host compilation pass: fall back to the host policy.
  return RAJA::atomicOperation(host_policy {},
                               acc,
                               std::forward<Operation>(operation));
#else
  // Device compilation pass.
  return detail::cuda_atomicCAS_loop(acc, std::forward<Operation>(operation));
#endif
}

} // namespace RAJA


Expand Down
59 changes: 59 additions & 0 deletions include/RAJA/policy/desul/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@

#if defined(RAJA_ENABLE_DESUL_ATOMICS)

#include <cstdint>
#include <type_traits>
#include <utility>

#include "RAJA/util/macros.hpp"
#include "RAJA/util/TypeConvert.hpp"

#include "RAJA/policy/atomic_builtin.hpp"

Expand All @@ -27,6 +32,32 @@ using raja_default_desul_scope = desul::MemoryScopeDevice;
namespace RAJA
{

namespace detail
{

// NOTE(review): equivalent equality helpers reportedly exist for other atomic
// backends; consider moving them to one shared location (pattern/ or
// internal/ were suggested in review).

/*!
 * @brief Equality test used by the compare-and-swap loop for
 *        non-floating-point types
 *
 * Constrained to non-floating-point T so that it never competes with the
 * floating-point overload below. Two overloads with identical function
 * parameters, one unconstrained and one enable_if-constrained, are ambiguous
 * for the types accepted by the constrained one.
 *
 * @param a First value
 * @param b Second value
 * @return Result of a == b
 */
template<typename T,
         std::enable_if_t<!std::is_floating_point<T>::value, bool> = true>
RAJA_HOST_DEVICE RAJA_INLINE bool desul_atomicCAS_equal(const T& a, const T& b)
{
  return a == b;
}

/*!
 * @brief Equality test used by the compare-and-swap loop for 32/64-bit
 *        floating-point types
 *
 * Compares the bit patterns of the two values instead of using operator==.
 * A NaN never compares equal to itself with operator==, so a value-based
 * comparison could never report that a compare-and-swap on a NaN succeeded.
 *
 * @param a First value
 * @param b Second value
 * @return true if a and b have identical object representations
 */
template<typename T,
         std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
RAJA_HOST_DEVICE RAJA_INLINE bool desul_atomicCAS_equal(const T& a, const T& b)
{
  static_assert(sizeof(T) == sizeof(std::uint32_t) ||
                    sizeof(T) == sizeof(std::uint64_t),
                "desul_atomicCAS_equal only supports 32/64-bit floating point");

  // Unsigned integer type with the same size as T.
  using R = std::conditional_t<sizeof(T) == sizeof(std::uint32_t),
                               std::uint32_t,
                               std::uint64_t>;

  return RAJA::util::reinterp_A_as_B<T, R>(a) ==
         RAJA::util::reinterp_A_as_B<T, R>(b);
}

}  // namespace detail

RAJA_SUPPRESS_HD_WARN
template<typename AtomicPolicy, typename T>
RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(AtomicPolicy, T* acc)
Expand Down Expand Up @@ -153,6 +184,34 @@ atomicCAS(AtomicPolicy, T* acc, T compare, T value)
raja_default_desul_scope {});
}

/*!
 * @brief Generic atomic operation implemented as a compare-and-swap loop on
 *        top of desul atomics
 *
 * Loads the current value, computes the desired value with @p operation and
 * tries to install it with desul::atomic_compare_exchange, retrying with the
 * freshly observed value until the exchange succeeds. @p operation may
 * therefore be invoked more than once.
 *
 * @param acc Pointer to the value to update
 * @param operation Callable that computes the new value from the old value
 * @return Value observed at *acc immediately before the update took effect
 *         (or the observed value itself when the update is a no-op)
 */
RAJA_SUPPRESS_HD_WARN
template<typename AtomicPolicy, typename T, typename Operation>
RAJA_HOST_DEVICE RAJA_INLINE T
atomicOperation(AtomicPolicy, T* acc, Operation&& operation)
{
  T expected = desul::atomic_load(acc,
                                  raja_default_desul_order {},
                                  raja_default_desul_scope {});

  while (true) {
    const T desired = operation(expected);

    // Short-circuit: the operation left the observed value unchanged (as a
    // min/max-style update often does), so no exchange is needed. This costs
    // a second comparison per iteration.
    //
    // The helper lives in RAJA::detail and must be qualified: for fundamental
    // T argument-dependent lookup has no associated namespace to search.
    if (detail::desul_atomicCAS_equal(desired, expected)) {
      return expected;  // no-op
    }

    const T old = desul::atomic_compare_exchange(acc,
                                                 expected,
                                                 desired,
                                                 raja_default_desul_order {},
                                                 raja_default_desul_scope {});

    if (detail::desul_atomicCAS_equal(old, expected)) {
      return old;  // success
    }

    expected = old;  // CAS failed, old is the latest observed value
  }
}

} // namespace RAJA

#endif // RAJA_ENABLE_DESUL_ATOMICS
Expand Down
12 changes: 12 additions & 0 deletions include/RAJA/policy/hip/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,18 @@ atomicCAS(hip_atomic_explicit<host_policy>, T* acc, T compare, T value)
#endif
}

/*!
 * @brief Generic atomic operation for hip_atomic_explicit policies
 *
 * When compiled for the device (__HIP_DEVICE_COMPILE__ defined) the update is
 * performed by detail::hip_atomicCAS_loop; otherwise it is delegated to the
 * atomicOperation overload for host_policy.
 *
 * @param acc Pointer to the value to update
 * @param operation Callable that computes the new value from the old value
 * @return The result of the selected implementation
 */
RAJA_SUPPRESS_HD_WARN
template<typename T, typename Operation, typename host_policy>
RAJA_INLINE RAJA_HOST_DEVICE T
atomicOperation(hip_atomic_explicit<host_policy>, T* acc, Operation&& operation)
{
#if !defined(__HIP_DEVICE_COMPILE__)
  // Host compilation pass: fall back to the host policy.
  return RAJA::atomicOperation(host_policy {},
                               acc,
                               std::forward<Operation>(operation));
#else
  // Device compilation pass.
  return detail::hip_atomicCAS_loop(acc, std::forward<Operation>(operation));
#endif
}

} // namespace RAJA


Expand Down
13 changes: 13 additions & 0 deletions include/RAJA/policy/openmp/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

#if defined(RAJA_ENABLE_OPENMP)

#include <utility>

#include "RAJA/policy/openmp/policy.hpp"

#include "RAJA/util/macros.hpp"
Expand Down Expand Up @@ -230,6 +232,17 @@ RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(omp_atomic, T* acc, T compare, T value)
return RAJA::atomicCAS(builtin_atomic {}, acc, compare, value);
}

/*!
 * @brief Generic atomic operation for the omp_atomic policy
 *
 * @param acc Pointer to the value to update
 * @param operation Callable that computes the new value from the old value
 * @return The result of the builtin_atomic overload
 */
RAJA_SUPPRESS_HD_WARN
template<typename T, typename Operation>
RAJA_HOST_DEVICE RAJA_INLINE T
atomicOperation(omp_atomic, T* acc, Operation&& operation)
{
  // There is no generic OpenMP atomic for an arbitrary update, so delegate to
  // the builtin atomics implementation.
  using fallback_policy = builtin_atomic;
  return RAJA::atomicOperation(fallback_policy {},
                               acc,
                               std::forward<Operation>(operation));
}

#endif // not defined RAJA_COMPILER_MSVC


Expand Down
9 changes: 9 additions & 0 deletions include/RAJA/policy/sequential/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,15 @@ RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(seq_atomic, T* acc, T compare, T value)
return ret;
}

/*!
 * @brief Generic "atomic" operation for the seq_atomic policy
 *
 * Performs a plain read-modify-write of *acc: reads the current value, stores
 * the result of applying @p operation to it, and returns the value that was
 * read.
 *
 * @param acc Pointer to the value to update
 * @param operation Callable that computes the new value from the old value
 * @return Value at *acc before the update
 */
RAJA_SUPPRESS_HD_WARN
template<typename T, typename Operation>
RAJA_HOST_DEVICE RAJA_INLINE T
atomicOperation(seq_atomic, T* acc, Operation&& operation)
{
  T previous = *acc;
  *acc       = operation(previous);
  return previous;
}


} // namespace RAJA

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ template <typename ExecPolicy,
void ForallAtomicBasicTestImpl( IdxType seglimit )
{
// initialize an array
const int len = 12;
const int len = 13;

camp::resources::Resource work_res{WorkingRes::get_default()};

Expand Down Expand Up @@ -93,6 +93,7 @@ void ForallAtomicBasicTestImpl( IdxType seglimit )
test_array[9] = static_cast<T>(0);
test_array[10] = static_cast<T>(0);
test_array[11] = static_cast<T>(0);
test_array[12] = static_cast<T>(1);

work_res.memcpy(work_array, test_array, sizeof(T) * len);

Expand All @@ -109,6 +110,23 @@ void ForallAtomicBasicTestImpl( IdxType seglimit )
RAJA::atomicStore<AtomicPolicy>(work_array + 9, static_cast<T>(1));
RAJA::atomicInc<AtomicPolicy>(work_array + 10, static_cast<T>(16));
RAJA::atomicDec<AtomicPolicy>(work_array + 11, static_cast<T>(16));

// Exercise generic atomicOperation with an order-independent update:
// compute factorial(N) by multiplying by (i+1) for i in [0, N).
//
// Choose N small enough that:
// - The result fits in 32-bit signed ints (avoids overflow/UB).
// - The intermediate values are exactly representable in float/double
// (avoids non-associativity issues).
constexpr IdxType factN = static_cast<IdxType>(10);
RAJA::atomicOperation<AtomicPolicy>(work_array + 12,
[=] RAJA_HOST_DEVICE(T old) {
if (i < factN)
{
return old * static_cast<T>(i + static_cast<IdxType>(1));
}
return old;
});
});

work_res.memcpy( check_array, work_array, sizeof(T) * len );
Expand All @@ -128,6 +146,7 @@ void ForallAtomicBasicTestImpl( IdxType seglimit )
EXPECT_EQ(static_cast<T>(1), check_array[9]);
EXPECT_EQ(static_cast<T>(4), check_array[10]);
EXPECT_EQ(static_cast<T>(13), check_array[11]);
EXPECT_EQ(static_cast<T>(3628800), check_array[12]);

deallocateForallTestData<T>(work_res,
work_array,
Expand Down
4 changes: 4 additions & 0 deletions test/unit/atomic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,7 @@ raja_add_test(
# Registers the test-atomic-ref-bitwise test, built from
# test-atomic-ref-bitwise.cpp.
raja_add_test(
NAME test-atomic-ref-bitwise
SOURCES test-atomic-ref-bitwise.cpp)

# Registers the test-atomic-operation test, built from
# test-atomic-operation.cpp.
# NOTE(review): presumably covers RAJA::atomicOperation; confirm that
# test-atomic-operation.cpp is part of this change.
raja_add_test(
NAME test-atomic-operation
SOURCES test-atomic-operation.cpp)
Loading