23 #ifndef RAJA_hip_intrinsics_HPP
24 #define RAJA_hip_intrinsics_HPP
26 #include "RAJA/config.hpp"
28 #if defined(RAJA_HIP_ACTIVE)
30 #include <type_traits>
32 #include <hip/hip_runtime.h>
46 struct DeviceConstants
52 ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;
59 : WARP_SIZE(warp_size),
60 MAX_BLOCK_SIZE(max_block_size),
61 MAX_WARPS(max_block_size / warp_size),
62 ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
70 #if defined(__HIP_PLATFORM_AMD__)
71 constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE,
77 #elif defined(__HIP_PLATFORM_NVIDIA__)
78 constexpr DeviceConstants device_constants(RAJA_CUDA_WARPSIZE,
82 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
83 "RAJA Assumption Broken: device_constants.WARP_SIZE < "
84 "device_constants.MAX_WARPS");
85 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
86 "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
87 "a multiple of device_constants.WARP_SIZE");
113 static RAJA_DEVICE RAJA_INLINE
void fence_acquire() { __threadfence(); }
115 static RAJA_DEVICE RAJA_INLINE
void fence_release() { __threadfence(); }
138 struct AccessorDeviceScopeUseBlockFence
141 static constexpr
size_t min_atomic_int_type_size =
sizeof(
unsigned int);
142 static constexpr
size_t max_atomic_int_type_size =
sizeof(
unsigned long long);
148 max_atomic_int_type_size>;
149 using integer_type =
typename ArrayType::integer_type;
152 auto ptr =
const_cast<integer_type*
>(
153 reinterpret_cast<const integer_type*
>(in_ptr + idx));
155 for (
size_t i = 0; i < u.array_size(); ++i)
157 #if defined(RAJA_USE_HIP_INTRINSICS) && \
158 RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
159 u.
array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED,
160 __HIP_MEMORY_SCOPE_AGENT);
162 u.array[i] =
::atomicAdd(&ptr[i], integer_type(0));
166 return u.get_value();
170 static RAJA_DEVICE RAJA_INLINE
void set(T* in_ptr,
size_t idx, T val)
173 max_atomic_int_type_size>;
174 using integer_type =
typename ArrayType::integer_type;
178 auto ptr =
reinterpret_cast<integer_type*
>(in_ptr + idx);
180 for (
size_t i = 0; i < u.array_size(); ++i)
182 #if defined(RAJA_USE_HIP_INTRINSICS) && \
183 RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
184 __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED,
185 __HIP_MEMORY_SCOPE_AGENT);
187 ::atomicExch(&ptr[i], u.array[i]);
192 static RAJA_DEVICE RAJA_INLINE
void fence_acquire()
194 #if defined(RAJA_USE_HIP_INTRINSICS) && \
195 RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
196 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE,
"workgroup");
202 static RAJA_DEVICE RAJA_INLINE
void fence_release()
204 #if defined(RAJA_USE_HIP_INTRINSICS) && \
205 RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \
206 RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
207 __builtin_amdgcn_fence(__ATOMIC_RELEASE,
"workgroup");
209 __builtin_amdgcn_s_waitcnt( 0 | ( 0x7 << 4) |
218 constexpr
size_t min_shfl_int_type_size =
sizeof(
unsigned int);
219 constexpr
size_t max_shfl_int_type_size =
sizeof(
unsigned int);
232 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var,
int laneMask)
235 max_shfl_int_type_size>
239 for (
size_t i = 0; i < u.array_size(); ++i)
241 u.array[i] = ::__shfl_xor(u.array[i], laneMask);
243 return u.get_value();
247 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var,
int srcLane)
250 max_shfl_int_type_size>
254 for (
size_t i = 0; i < u.array_size(); ++i)
256 u.array[i] = ::__shfl(u.array[i], srcLane);
258 return u.get_value();
262 RAJA_DEVICE RAJA_INLINE
int shfl_xor_sync<int>(
int var,
int laneMask)
264 return ::__shfl_xor(var, laneMask);
268 RAJA_DEVICE RAJA_INLINE
float shfl_xor_sync<float>(
float var,
int laneMask)
270 return ::__shfl_xor(var, laneMask);
274 RAJA_DEVICE RAJA_INLINE
int shfl_sync<int>(
int var,
int srcLane)
276 return ::__shfl(var, srcLane);
280 RAJA_DEVICE RAJA_INLINE
float shfl_sync<float>(
float var,
int srcLane)
282 return ::__shfl(var, srcLane);
286 template<
typename Combiner,
typename T>
289 int numThreads = blockDim.x * blockDim.y * blockDim.z;
291 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
292 (blockDim.x * blockDim.y) * threadIdx.z;
296 if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
300 for (
int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
302 T rhs = shfl_xor_sync(temp, i);
303 Combiner {}(temp, rhs);
310 for (
int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
312 int srcLane = threadId ^ i;
313 T rhs = shfl_sync(temp, srcLane);
315 if (srcLane < numThreads)
317 Combiner {}(temp, rhs);
332 template<
typename Combiner,
typename T>
337 for (
int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
339 T rhs = shfl_xor_sync(temp, i);
340 Combiner {}(temp, rhs);
347 template<
typename Combiner,
typename T>
348 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
350 int numThreads = blockDim.x * blockDim.y * blockDim.z;
352 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
353 (blockDim.x * blockDim.y) * threadIdx.z;
355 int warpId = threadId % policy::hip::device_constants.WARP_SIZE;
356 int warpNum = threadId / policy::hip::device_constants.WARP_SIZE;
360 if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
364 for (
int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
366 T rhs = shfl_xor_sync(temp, i);
367 Combiner {}(temp, rhs);
374 for (
int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
376 int srcLane = threadId ^ i;
377 T rhs = shfl_sync(temp, srcLane);
379 if (srcLane < numThreads)
381 Combiner {}(temp, rhs);
387 if (numThreads > policy::hip::device_constants.WARP_SIZE)
390 static_assert(policy::hip::device_constants.MAX_WARPS <=
391 policy::hip::device_constants.WARP_SIZE,
392 "This algorithms assumes a warp of WARP_SIZE threads can "
393 "reduce MAX_WARPS values");
395 __shared__
unsigned char tmpsd[
sizeof(
399 T, policy::hip::device_constants.MAX_WARPS
>*>(tmpsd);
404 sd->
set(warpNum, temp);
413 if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads)
415 temp = sd->get(warpId);
422 for (
int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2)
424 T rhs = shfl_xor_sync(temp, i);
425 Combiner {}(temp, rhs);
Header file for common RAJA internal definitions.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Header file for common RAJA internal macro definitions.
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DEVICE
Definition: macros.hpp:66
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
std::ptrdiff_t Index_type
Definition: types.hpp:226
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
Abstracts T into an equal or greater size array of integers whose size is between min_integer_type_si...
Definition: types.hpp:962
integer_type array[num_integer_type]
Definition: types.hpp:1000
RAJA_HOST_DEVICE void set_value(T value)
Definition: types.hpp:1016
Abstracts access to memory using normal memory accesses.
Definition: types.hpp:938
Header file for RAJA type definitions.