23 #ifndef RAJA_cuda_intrinsics_HPP
24 #define RAJA_cuda_intrinsics_HPP
26 #include "RAJA/config.hpp"
28 #if defined(RAJA_CUDA_ACTIVE)
30 #include <type_traits>
47 struct DeviceConstants
53 ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;
60 : WARP_SIZE(warp_size),
61 MAX_BLOCK_SIZE(max_block_size),
62 MAX_WARPS(max_block_size / warp_size),
63 ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
71 constexpr DeviceConstants device_constants(RAJA_CUDA_WARPSIZE,
74 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
75 "RAJA Assumption Broken: device_constants.WARP_SIZE < "
76 "device_constants.MAX_WARPS");
77 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
78 "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
79 "a multiple of device_constants.WARP_SIZE");
81 constexpr
const size_t MIN_BLOCKS_PER_SM = 1;
82 constexpr
const size_t MAX_BLOCKS_PER_SM = 32;
108 static RAJA_DEVICE RAJA_INLINE
void fence_acquire() { __threadfence(); }
110 static RAJA_DEVICE RAJA_INLINE
void fence_release() { __threadfence(); }
133 struct AccessorDeviceScopeUseBlockFence
136 static constexpr
size_t min_atomic_int_type_size =
sizeof(
unsigned int);
137 static constexpr
size_t max_atomic_int_type_size =
sizeof(
unsigned long long);
143 max_atomic_int_type_size>;
144 using integer_type =
typename ArrayType::integer_type;
147 auto ptr =
const_cast<integer_type*
>(
148 reinterpret_cast<const integer_type*
>(in_ptr + idx));
150 for (
size_t i = 0; i < u.array_size(); ++i)
155 return u.get_value();
159 static RAJA_DEVICE RAJA_INLINE
void set(T* in_ptr,
size_t idx, T val)
162 max_atomic_int_type_size>;
163 using integer_type =
typename ArrayType::integer_type;
167 auto ptr =
reinterpret_cast<integer_type*
>(in_ptr + idx);
169 for (
size_t i = 0; i < u.array_size(); ++i)
171 ::atomicExch(&ptr[i], u.array[i]);
175 static RAJA_DEVICE RAJA_INLINE
void fence_acquire() { __threadfence(); }
177 static RAJA_DEVICE RAJA_INLINE
void fence_release() { __threadfence(); }
181 constexpr
size_t min_shfl_int_type_size =
sizeof(
unsigned int);
182 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
183 constexpr
size_t max_shfl_int_type_size =
sizeof(
unsigned long long);
185 constexpr
size_t max_shfl_int_type_size =
sizeof(
unsigned int);
199 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var,
int laneMask)
202 max_shfl_int_type_size>
206 for (
size_t i = 0; i < u.array_size(); ++i)
208 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
209 u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask);
211 u.array[i] = ::__shfl_xor(u.array[i], laneMask);
214 return u.get_value();
218 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var,
int srcLane)
221 max_shfl_int_type_size>
225 for (
size_t i = 0; i < u.array_size(); ++i)
227 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
228 u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane);
230 u.array[i] = ::__shfl(u.array[i], srcLane);
233 return u.get_value();
236 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
239 RAJA_DEVICE RAJA_INLINE
int shfl_xor_sync<int>(
int var,
int laneMask)
241 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
245 RAJA_DEVICE RAJA_INLINE
unsigned int shfl_xor_sync<unsigned int>(
249 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
253 RAJA_DEVICE RAJA_INLINE
long shfl_xor_sync<long>(
long var,
int laneMask)
255 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
259 RAJA_DEVICE RAJA_INLINE
unsigned long shfl_xor_sync<unsigned long>(
263 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
267 RAJA_DEVICE RAJA_INLINE
long long shfl_xor_sync<long long>(
long long var,
270 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
274 RAJA_DEVICE RAJA_INLINE
unsigned long long shfl_xor_sync<unsigned long long>(
275 unsigned long long var,
278 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
282 RAJA_DEVICE RAJA_INLINE
float shfl_xor_sync<float>(
float var,
int laneMask)
284 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
288 RAJA_DEVICE RAJA_INLINE
double shfl_xor_sync<double>(
double var,
int laneMask)
290 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
296 RAJA_DEVICE RAJA_INLINE
int shfl_xor_sync<int>(
int var,
int laneMask)
298 return ::__shfl_xor(var, laneMask);
302 RAJA_DEVICE RAJA_INLINE
float shfl_xor_sync<float>(
float var,
int laneMask)
304 return ::__shfl_xor(var, laneMask);
310 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
313 RAJA_DEVICE RAJA_INLINE
int shfl_sync<int>(
int var,
int srcLane)
315 return ::__shfl_sync(0xffffffffu, var, srcLane);
319 RAJA_DEVICE RAJA_INLINE
unsigned int shfl_sync<unsigned int>(
unsigned int var,
322 return ::__shfl_sync(0xffffffffu, var, srcLane);
326 RAJA_DEVICE RAJA_INLINE
long shfl_sync<long>(
long var,
int srcLane)
328 return ::__shfl_sync(0xffffffffu, var, srcLane);
332 RAJA_DEVICE RAJA_INLINE
unsigned long shfl_sync<unsigned long>(
336 return ::__shfl_sync(0xffffffffu, var, srcLane);
340 RAJA_DEVICE RAJA_INLINE
long long shfl_sync<long long>(
long long var,
343 return ::__shfl_sync(0xffffffffu, var, srcLane);
347 RAJA_DEVICE RAJA_INLINE
unsigned long long shfl_sync<unsigned long long>(
348 unsigned long long var,
351 return ::__shfl_sync(0xffffffffu, var, srcLane);
355 RAJA_DEVICE RAJA_INLINE
float shfl_sync<float>(
float var,
int srcLane)
357 return ::__shfl_sync(0xffffffffu, var, srcLane);
361 RAJA_DEVICE RAJA_INLINE
double shfl_sync<double>(
double var,
int srcLane)
363 return ::__shfl_sync(0xffffffffu, var, srcLane);
369 RAJA_DEVICE RAJA_INLINE
int shfl_sync<int>(
int var,
int srcLane)
371 return ::__shfl(var, srcLane);
375 RAJA_DEVICE RAJA_INLINE
float shfl_sync<float>(
float var,
int srcLane)
377 return ::__shfl(var, srcLane);
384 template<
typename Combiner,
typename T>
387 int numThreads = blockDim.x * blockDim.y * blockDim.z;
389 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
390 (blockDim.x * blockDim.y) * threadIdx.z;
394 if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
398 for (
int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
400 T rhs = shfl_xor_sync(temp, i);
401 Combiner {}(temp, rhs);
408 for (
int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
410 int srcLane = threadId ^ i;
411 T rhs = shfl_sync(temp, srcLane);
413 if (srcLane < numThreads)
415 Combiner {}(temp, rhs);
430 template<
typename Combiner,
typename T>
435 for (
int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
437 T rhs = __shfl_xor_sync(0xffffffff, temp, i);
438 Combiner {}(temp, rhs);
445 template<
typename Combiner,
typename T>
446 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
448 int numThreads = blockDim.x * blockDim.y * blockDim.z;
450 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
451 (blockDim.x * blockDim.y) * threadIdx.z;
453 int warpId = threadId % policy::cuda::device_constants.WARP_SIZE;
454 int warpNum = threadId / policy::cuda::device_constants.WARP_SIZE;
458 if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
462 for (
int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
464 T rhs = shfl_xor_sync(temp, i);
465 Combiner {}(temp, rhs);
472 for (
int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
474 int srcLane = threadId ^ i;
475 T rhs = shfl_sync(temp, srcLane);
477 if (srcLane < numThreads)
479 Combiner {}(temp, rhs);
485 if (numThreads > policy::cuda::device_constants.WARP_SIZE)
488 static_assert(policy::cuda::device_constants.MAX_WARPS <=
489 policy::cuda::device_constants.WARP_SIZE,
490 "This algorithms assumes a warp of WARP_SIZE threads can "
491 "reduce MAX_WARPS values");
494 __shared__
unsigned char tmpsd[
sizeof(
501 T, policy::cuda::device_constants.MAX_WARPS
>*>(tmpsd);
506 sd->
set(warpNum, temp);
515 if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads)
517 temp = sd->get(warpId);
524 for (
int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2)
526 T rhs = shfl_xor_sync(temp, i);
527 Combiner {}(temp, rhs);
Header file for common RAJA internal definitions.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Header file for common RAJA internal macro definitions.
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DEVICE
Definition: macros.hpp:66
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
std::ptrdiff_t Index_type
Definition: types.hpp:226
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
Abstracts T into an equal or greater size array of integers whose size is between min_integer_type_si...
Definition: types.hpp:962
integer_type array[num_integer_type]
Definition: types.hpp:1000
RAJA_HOST_DEVICE void set_value(T value)
Definition: types.hpp:1016
Abstracts access to memory using normal memory accesses.
Definition: types.hpp:938
Header file for RAJA type definitions.