20 #ifndef RAJA_policy_cuda_atomic_HPP
21 #define RAJA_policy_cuda_atomic_HPP
23 #include "RAJA/config.hpp"
25 #if defined(RAJA_ENABLE_CUDA)
28 #include <type_traits>
30 #if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 && \
31 __CUDACC_VER_MINOR__ >= 6
32 #define RAJA_ENABLE_CUDA_ATOMIC_REF
35 #if defined(RAJA_ENABLE_CUDA_ATOMIC_REF)
36 #include <cuda/atomic>
39 #include "camp/list.hpp"
44 #if defined(RAJA_OPENMP_ACTIVE)
71 struct cuda_useBuiltinCommon
73 static constexpr
bool value = std::is_same<T, int>::value ||
74 std::is_same<T, unsigned int>::value ||
75 std::is_same<T, unsigned long long>::value;
86 struct cuda_useReinterpretCommon
88 static constexpr
bool value = !cuda_useBuiltinCommon<T>::value &&
89 (
sizeof(T) ==
sizeof(
unsigned int) ||
90 sizeof(T) ==
sizeof(
unsigned long long));
92 using type = std::conditional_t<
sizeof(T) ==
sizeof(
unsigned int),
101 using cuda_useReinterpretCommon_t =
typename cuda_useReinterpretCommon<T>::type;
111 std::enable_if_t<cuda_useBuiltinCommon<T>::value,
bool> =
true>
112 RAJA_INLINE __device__ T cuda_atomicOr(T* acc, T value)
126 struct cuda_useBuiltinExchange
128 static constexpr
bool value = std::is_same<T, int>::value ||
129 std::is_same<T, unsigned int>::value ||
130 std::is_same<T, unsigned long long>::value ||
131 std::is_same<T, float>::value;
139 struct cuda_useReinterpretExchange
141 static constexpr
bool value = !cuda_useBuiltinExchange<T>::value &&
142 (
sizeof(T) ==
sizeof(
unsigned int) ||
143 sizeof(T) ==
sizeof(
unsigned long long));
145 using type = std::conditional_t<
sizeof(T) ==
sizeof(
unsigned int),
154 using cuda_useReinterpretExchange_t =
155 typename cuda_useReinterpretExchange<T>::type;
162 std::enable_if_t<cuda_useBuiltinExchange<T>::value,
bool> =
true>
163 RAJA_INLINE __device__ T cuda_atomicExchange(T* acc, T value)
165 return ::atomicExch(acc, value);
173 std::enable_if_t<cuda_useReinterpretExchange<T>::value,
bool> =
true>
174 RAJA_INLINE __device__ T cuda_atomicExchange(T* acc, T value)
176 using R = cuda_useReinterpretExchange_t<T>;
178 return RAJA::util::reinterp_A_as_B<R, T>(cuda_atomicExchange(
179 reinterpret_cast<R*
>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
185 #if defined(RAJA_ENABLE_CUDA_ATOMIC_REF)
188 RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
190 return cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).load(
191 cuda::memory_order_relaxed {});
195 RAJA_INLINE __device__
void cuda_atomicStore(T* acc, T value)
197 cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).store(
198 value, cuda::memory_order_relaxed {});
204 std::enable_if_t<cuda_useBuiltinCommon<T>::value,
bool> =
true>
205 RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
207 return cuda_atomicOr(acc,
static_cast<T
>(0));
211 std::enable_if_t<cuda_useReinterpretCommon<T>::value,
bool> =
true>
212 RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
214 using R = cuda_useReinterpretCommon_t<T>;
216 return RAJA::util::reinterp_A_as_B<R, T>(
217 cuda_atomicLoad(
reinterpret_cast<R*
>(acc)));
221 RAJA_INLINE __device__
void cuda_atomicStore(T* acc, T value)
223 cuda_atomicExchange(acc, value);
238 struct cuda_useBuiltinCAS
240 static constexpr
bool value =
241 #if __CUDA_ARCH__ >= 700
242 std::is_same<T, unsigned short int>::value ||
244 std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
245 std::is_same<T, unsigned long long>::value;
254 struct cuda_useReinterpretCAS
256 static constexpr
bool value = !cuda_useBuiltinCAS<T>::value &&
258 #if __CUDA_ARCH__ >= 700
259 sizeof(T) ==
sizeof(
unsigned short) ||
261 sizeof(T) ==
sizeof(
unsigned int) ||
262 sizeof(T) ==
sizeof(
unsigned long long));
265 #if __CUDA_ARCH__ >= 700
266 std::conditional_t<
sizeof(T) ==
sizeof(
unsigned short),
269 std::conditional_t<
sizeof(T) ==
sizeof(
unsigned int),
272 #if __CUDA_ARCH__ >= 700
282 using cuda_useReinterpretCAS_t =
typename cuda_useReinterpretCAS<T>::type;
285 std::enable_if_t<cuda_useBuiltinCAS<T>::value,
bool> =
true>
286 RAJA_INLINE __device__ T cuda_atomicCAS(T* acc, T compare, T value)
292 std::enable_if_t<cuda_useReinterpretCAS<T>::value,
bool> =
true>
293 RAJA_INLINE __device__ T cuda_atomicCAS(T* acc, T compare, T value)
295 using R = cuda_useReinterpretCAS_t<T>;
297 return RAJA::util::reinterp_A_as_B<R, T>(cuda_atomicCAS(
298 reinterpret_cast<R*
>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
299 RAJA::util::reinterp_A_as_B<T, R>(value)));
308 std::enable_if_t<cuda_useBuiltinCommon<T>::value,
bool> =
true>
309 RAJA_INLINE __device__
bool cuda_atomicCAS_equal(
const T& a,
const T& b)
315 std::enable_if_t<cuda_useReinterpretCommon<T>::value,
bool> =
true>
316 RAJA_INLINE __device__
bool cuda_atomicCAS_equal(
const T& a,
const T& b)
318 using R = cuda_useReinterpretCommon_t<T>;
320 return cuda_atomicCAS_equal(RAJA::util::reinterp_A_as_B<T, R>(a),
321 RAJA::util::reinterp_A_as_B<T, R>(b));
330 template<
typename T,
typename Oper>
331 RAJA_INLINE __device__ T cuda_atomicCAS_loop(T* acc, Oper&& oper)
333 T old = cuda_atomicLoad(acc);
339 old = cuda_atomicCAS(acc, expected, oper(expected));
340 }
while (!cuda_atomicCAS_equal(old, expected));
351 template<
typename T,
typename Oper,
typename ShortCircuit>
352 RAJA_INLINE __device__ T cuda_atomicCAS_loop(T* acc,
356 T old = cuda_atomicLoad(acc);
368 old = cuda_atomicCAS(acc, expected, oper(expected));
369 }
while (!cuda_atomicCAS_equal(old, expected) && !sc(old));
377 using cuda_atomicAdd_builtin_types = ::camp::list<int,
379 unsigned long long int,
381 #if __CUDA_ARCH__ >= 600
390 RAJA_INLINE __device__ T cuda_atomicAdd(T* acc, T value)
392 return cuda_atomicCAS_loop(acc, [value](T old) {
400 RAJA_INLINE __device__ T cuda_atomicAdd(T* acc, T value)
408 using cuda_atomicSub_builtin_types = cuda_atomicAdd_builtin_types;
410 using cuda_atomicSub_via_Sub_builtin_types = ::camp::list<int, unsigned int>;
412 using cuda_atomicSub_via_Add_builtin_types =
413 ::camp::list<
unsigned long long int,
415 #if __CUDA_ARCH__ >= 600
424 RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
426 return cuda_atomicCAS_loop(acc, [value](T old) {
435 RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
444 RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
452 using cuda_atomicMinMax_builtin_types = ::camp::list<int,
454 #if __CUDA_ARCH__ >= 500
457 unsigned long long int
468 RAJA_INLINE __device__ T cuda_atomicMin(T* acc, T value)
470 return cuda_atomicCAS_loop(
473 return value < old ? value : old;
476 return current <= value;
483 RAJA_INLINE __device__ T cuda_atomicMin(T* acc, T value)
494 RAJA_INLINE __device__ T cuda_atomicMax(T* acc, T value)
496 return cuda_atomicCAS_loop(
499 return old < value ? value : old;
502 return value <= current;
509 RAJA_INLINE __device__ T cuda_atomicMax(T* acc, T value)
517 using cuda_atomicIncDecReset_builtin_types = ::camp::list<unsigned int>;
526 RAJA_INLINE __device__ T cuda_atomicInc(T* acc, T value)
530 return cuda_atomicCAS_loop(acc, [value](T old) {
531 return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
539 RAJA_INLINE __device__ T cuda_atomicInc(T* acc, T value)
548 RAJA_INLINE __device__ T cuda_atomicInc(T* acc)
550 return cuda_atomicAdd(acc,
static_cast<T
>(1));
560 RAJA_INLINE __device__ T cuda_atomicDec(T* acc, T value)
564 return cuda_atomicCAS_loop(acc, [value](T old) {
565 return old ==
static_cast<T
>(0) || value < old ? value
566 : old -
static_cast<T
>(1);
574 RAJA_INLINE __device__ T cuda_atomicDec(T* acc, T value)
583 RAJA_INLINE __device__ T cuda_atomicDec(T* acc)
585 return cuda_atomicSub(acc,
static_cast<T
>(1));
591 using cuda_atomicBit_builtin_types =
592 ::camp::list<int, unsigned int, unsigned long long int>;
600 RAJA_INLINE __device__ T cuda_atomicAnd(T* acc, T value)
602 return cuda_atomicCAS_loop(acc, [value](T old) {
610 RAJA_INLINE __device__ T cuda_atomicAnd(T* acc, T value)
621 RAJA_INLINE __device__ T cuda_atomicOr(T* acc, T value)
623 return cuda_atomicCAS_loop(acc, [value](T old) {
640 RAJA_INLINE __device__ T cuda_atomicXor(T* acc, T value)
642 return cuda_atomicCAS_loop(acc, [value](T old) {
650 RAJA_INLINE __device__ T cuda_atomicXor(T* acc, T value)
667 template<
typename T,
typename host_policy>
672 return detail::cuda_atomicLoad(acc);
679 template<
typename T,
typename host_policy>
685 detail::cuda_atomicStore(acc, value);
692 template<
typename T,
typename host_policy>
698 return detail::cuda_atomicAdd(acc, value);
705 template<
typename T,
typename host_policy>
711 return detail::cuda_atomicSub(acc, value);
718 template<
typename T,
typename host_policy>
724 return detail::cuda_atomicMin(acc, value);
731 template<
typename T,
typename host_policy>
737 return detail::cuda_atomicMax(acc, value);
744 template<
typename T,
typename host_policy>
752 return detail::cuda_atomicInc(acc, value);
759 template<
typename T,
typename host_policy>
764 return detail::cuda_atomicInc(acc);
771 template<
typename T,
typename host_policy>
779 return detail::cuda_atomicDec(acc, value);
786 template<
typename T,
typename host_policy>
791 return detail::cuda_atomicDec(acc);
798 template<
typename T,
typename host_policy>
804 return detail::cuda_atomicAnd(acc, value);
811 template<
typename T,
typename host_policy>
817 return detail::cuda_atomicOr(acc, value);
824 template<
typename T,
typename host_policy>
830 return detail::cuda_atomicXor(acc, value);
837 template<
typename T,
typename host_policy>
843 return detail::cuda_atomicExchange(acc, value);
850 template<
typename T,
typename host_policy>
852 atomicCAS(cuda_atomic_explicit<host_policy>, T* acc, T compare, T value)
855 return detail::cuda_atomicCAS(acc, compare, value);
Header file for enable_if helpers.
Header file for RAJA operator definitions.
Header file for reinterpreting type conversions.
RAJA header file defining automatic and builtin atomic operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_SUPPRESS_HD_WARN
Definition: macros.hpp:68
std::enable_if_t< is_any_of< T, TypeList >::value, T > enable_if_is_any_of
Definition: EnableIf.hpp:49
std::enable_if_t<::RAJA::concepts::negate< is_any_of< T, TypeList > >::value, T > enable_if_is_none_of
Definition: EnableIf.hpp:54
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
Atomic bitwise AND equivalent to (*acc) = (*acc) & value This only works with integral data types.
Definition: atomic.hpp:224
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(seq_atomic, T *acc, T compare, T value)
Definition: atomic.hpp:154
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
Atomic bitwise OR equivalent to (*acc) = (*acc) | value This only works with integral data types.
Definition: atomic.hpp:240
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value)
Definition: atomic.hpp:73
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc)
Atomic load.
Definition: atomic.hpp:92
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc)
Atomic decrement.
Definition: atomic.hpp:195
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value)
Definition: atomic.hpp:55
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value)
Atomic bitwise XOR equivalent to (*acc) = (*acc) ^ value This only works with integral data types.
Definition: atomic.hpp:256
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
Definition: atomic.hpp:136
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value)
Atomic store.
Definition: atomic.hpp:104
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
Definition: atomic.hpp:118
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
Atomic maximum equivalent to (*acc) = std::max(*acc, value)
Definition: atomic.hpp:156
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val)
Definition: atomic.hpp:109
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
Atomic compare and swap.
Definition: atomic.hpp:286
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
Atomic increment.
Definition: atomic.hpp:168
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value)
Definition: atomic.hpp:46
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
Atomic minimum equivalent to (*acc) = std::min(*acc, value)
Definition: atomic.hpp:143
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value)
Definition: atomic.hpp:64
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value)
Atomic subtract.
Definition: atomic.hpp:130
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
Definition: atomic.hpp:127
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value)
Atomic value exchange.
Definition: atomic.hpp:271
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val)
Definition: atomic.hpp:91
RAJA header file defining OpenMP atomic operations.
RAJA header file defining sequential atomic operations.