RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
intrinsics.hpp
Go to the documentation of this file.
1 
14 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
15 // Copyright (c) Lawrence Livermore National Security, LLC and other
16 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
17 // files for dates and other details. No copyright assignment is required
18 // to contribute to RAJA.
19 //
20 // SPDX-License-Identifier: (BSD-3-Clause)
21 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
22 
23 #ifndef RAJA_cuda_intrinsics_HPP
24 #define RAJA_cuda_intrinsics_HPP
25 
26 #include "RAJA/config.hpp"
27 
28 #if defined(RAJA_CUDA_ACTIVE)
29 
30 #include <type_traits>
31 
32 #include <cuda.h>
33 
34 #include "RAJA/util/macros.hpp"
35 #include "RAJA/util/SoAArray.hpp"
36 #include "RAJA/util/types.hpp"
37 
38 namespace RAJA
39 {
40 
41 namespace policy
42 {
43 
44 namespace cuda
45 {
46 
47 struct DeviceConstants
48 {
49  RAJA::Index_type WARP_SIZE;
50  RAJA::Index_type MAX_BLOCK_SIZE;
51  RAJA::Index_type MAX_WARPS;
53  ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of
54  // the cache level that handles
55  // atomics
56 
57  constexpr DeviceConstants(RAJA::Index_type warp_size,
58  RAJA::Index_type max_block_size,
59  RAJA::Index_type atomic_cache_line_bytes) noexcept
60  : WARP_SIZE(warp_size),
61  MAX_BLOCK_SIZE(max_block_size),
62  MAX_WARPS(max_block_size / warp_size),
63  ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
64  {}
65 };
66 
67 //
68 // Operations in the included files are parametrized using the following
69 // values for CUDA warp size and max block size.
70 //
71 constexpr DeviceConstants device_constants(RAJA_CUDA_WARPSIZE,
72  1024,
73  32); // V100
74 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
75  "RAJA Assumption Broken: device_constants.WARP_SIZE < "
76  "device_constants.MAX_WARPS");
77 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
78  "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
79  "a multiple of device_constants.WARP_SIZE");
80 
81 constexpr const size_t MIN_BLOCKS_PER_SM = 1;
82 constexpr const size_t MAX_BLOCKS_PER_SM = 32;
83 
84 } // end namespace cuda
85 
86 } // end namespace policy
87 
88 namespace cuda
89 {
90 
91 namespace impl
92 {
93 
106 struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
107 {
108  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
109 
110  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
111 };
112 
133 struct AccessorDeviceScopeUseBlockFence
134 {
135  // cuda has 32 and 64 bit atomics
136  static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
137  static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
138 
139  template<typename T>
140  static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
141  {
142  using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
143  max_atomic_int_type_size>;
144  using integer_type = typename ArrayType::integer_type;
145 
146  ArrayType u;
147  auto ptr = const_cast<integer_type*>(
148  reinterpret_cast<const integer_type*>(in_ptr + idx));
149 
150  for (size_t i = 0; i < u.array_size(); ++i)
151  {
152  u.array[i] = ::atomicAdd(&ptr[i], integer_type(0));
153  }
154 
155  return u.get_value();
156  }
157 
158  template<typename T>
159  static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
160  {
161  using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
162  max_atomic_int_type_size>;
163  using integer_type = typename ArrayType::integer_type;
164 
165  ArrayType u;
166  u.set_value(val);
167  auto ptr = reinterpret_cast<integer_type*>(in_ptr + idx);
168 
169  for (size_t i = 0; i < u.array_size(); ++i)
170  {
171  ::atomicExch(&ptr[i], u.array[i]);
172  }
173  }
174 
175  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
176 
177  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
178 };
179 
180 // cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits
181 constexpr size_t min_shfl_int_type_size = sizeof(unsigned int);
182 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
183 constexpr size_t max_shfl_int_type_size = sizeof(unsigned long long);
184 #else
185 constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
186 #endif
187 
198 template<typename T>
199 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
200 {
201  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
202  max_shfl_int_type_size>
203  u;
204  u.set_value(var);
205 
206  for (size_t i = 0; i < u.array_size(); ++i)
207  {
208 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
209  u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask);
210 #else
211  u.array[i] = ::__shfl_xor(u.array[i], laneMask);
212 #endif
213  }
214  return u.get_value();
215 }
216 
217 template<typename T>
218 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
219 {
220  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
221  max_shfl_int_type_size>
222  u;
223  u.set_value(var);
224 
225  for (size_t i = 0; i < u.array_size(); ++i)
226  {
227 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
228  u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane);
229 #else
230  u.array[i] = ::__shfl(u.array[i], srcLane);
231 #endif
232  }
233  return u.get_value();
234 }
235 
236 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
237 
238 template<>
239 RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
240 {
241  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
242 }
243 
244 template<>
245 RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync<unsigned int>(
246  unsigned int var,
247  int laneMask)
248 {
249  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
250 }
251 
252 template<>
253 RAJA_DEVICE RAJA_INLINE long shfl_xor_sync<long>(long var, int laneMask)
254 {
255  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
256 }
257 
258 template<>
259 RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync<unsigned long>(
260  unsigned long var,
261  int laneMask)
262 {
263  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
264 }
265 
266 template<>
267 RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var,
268  int laneMask)
269 {
270  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
271 }
272 
273 template<>
274 RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(
275  unsigned long long var,
276  int laneMask)
277 {
278  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
279 }
280 
281 template<>
282 RAJA_DEVICE RAJA_INLINE float shfl_xor_sync<float>(float var, int laneMask)
283 {
284  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
285 }
286 
287 template<>
288 RAJA_DEVICE RAJA_INLINE double shfl_xor_sync<double>(double var, int laneMask)
289 {
290  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
291 }
292 
293 #else
294 
295 template<>
296 RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
297 {
298  return ::__shfl_xor(var, laneMask);
299 }
300 
301 template<>
302 RAJA_DEVICE RAJA_INLINE float shfl_xor_sync<float>(float var, int laneMask)
303 {
304  return ::__shfl_xor(var, laneMask);
305 }
306 
307 #endif
308 
309 
310 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
311 
312 template<>
313 RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
314 {
315  return ::__shfl_sync(0xffffffffu, var, srcLane);
316 }
317 
318 template<>
319 RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync<unsigned int>(unsigned int var,
320  int srcLane)
321 {
322  return ::__shfl_sync(0xffffffffu, var, srcLane);
323 }
324 
325 template<>
326 RAJA_DEVICE RAJA_INLINE long shfl_sync<long>(long var, int srcLane)
327 {
328  return ::__shfl_sync(0xffffffffu, var, srcLane);
329 }
330 
331 template<>
332 RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync<unsigned long>(
333  unsigned long var,
334  int srcLane)
335 {
336  return ::__shfl_sync(0xffffffffu, var, srcLane);
337 }
338 
339 template<>
340 RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var,
341  int srcLane)
342 {
343  return ::__shfl_sync(0xffffffffu, var, srcLane);
344 }
345 
346 template<>
347 RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(
348  unsigned long long var,
349  int srcLane)
350 {
351  return ::__shfl_sync(0xffffffffu, var, srcLane);
352 }
353 
354 template<>
355 RAJA_DEVICE RAJA_INLINE float shfl_sync<float>(float var, int srcLane)
356 {
357  return ::__shfl_sync(0xffffffffu, var, srcLane);
358 }
359 
360 template<>
361 RAJA_DEVICE RAJA_INLINE double shfl_sync<double>(double var, int srcLane)
362 {
363  return ::__shfl_sync(0xffffffffu, var, srcLane);
364 }
365 
366 #else
367 
368 template<>
369 RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
370 {
371  return ::__shfl(var, srcLane);
372 }
373 
374 template<>
375 RAJA_DEVICE RAJA_INLINE float shfl_sync<float>(float var, int srcLane)
376 {
377  return ::__shfl(var, srcLane);
378 }
379 
380 #endif
381 
382 
384 template<typename Combiner, typename T>
385 RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
386 {
387  int numThreads = blockDim.x * blockDim.y * blockDim.z;
388 
389  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
390  (blockDim.x * blockDim.y) * threadIdx.z;
391 
392  T temp = val;
393 
394  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
395  {
396 
397  // reduce each warp
398  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
399  {
400  T rhs = shfl_xor_sync(temp, i);
401  Combiner {}(temp, rhs);
402  }
403  }
404  else
405  {
406 
407  // reduce each warp
408  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
409  {
410  int srcLane = threadId ^ i;
411  T rhs = shfl_sync(temp, srcLane);
412  // only add from threads that exist (don't double count own value)
413  if (srcLane < numThreads)
414  {
415  Combiner {}(temp, rhs);
416  }
417  }
418  }
419 
420  return temp;
421 }
422 
430 template<typename Combiner, typename T>
431 RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
432 {
433  T temp = val;
434 
435  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
436  {
437  T rhs = __shfl_xor_sync(0xffffffff, temp, i);
438  Combiner {}(temp, rhs);
439  }
440 
441  return temp;
442 }
443 
445 template<typename Combiner, typename T>
446 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
447 {
448  int numThreads = blockDim.x * blockDim.y * blockDim.z;
449 
450  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
451  (blockDim.x * blockDim.y) * threadIdx.z;
452 
453  int warpId = threadId % policy::cuda::device_constants.WARP_SIZE;
454  int warpNum = threadId / policy::cuda::device_constants.WARP_SIZE;
455 
456  T temp = val;
457 
458  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
459  {
460 
461  // reduce each warp
462  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
463  {
464  T rhs = shfl_xor_sync(temp, i);
465  Combiner {}(temp, rhs);
466  }
467  }
468  else
469  {
470 
471  // reduce each warp
472  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
473  {
474  int srcLane = threadId ^ i;
475  T rhs = shfl_sync(temp, srcLane);
476  // only add from threads that exist (don't double count own value)
477  if (srcLane < numThreads)
478  {
479  Combiner {}(temp, rhs);
480  }
481  }
482  }
483 
484  // reduce per warp values
485  if (numThreads > policy::cuda::device_constants.WARP_SIZE)
486  {
487 
488  static_assert(policy::cuda::device_constants.MAX_WARPS <=
489  policy::cuda::device_constants.WARP_SIZE,
490  "This algorithms assumes a warp of WARP_SIZE threads can "
491  "reduce MAX_WARPS values");
492 
493  // Need to separate declaration and initialization for clang-cuda
494  __shared__ unsigned char tmpsd[sizeof(
495  RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>)];
496 
497  // Partial placement new: Should call new(tmpsd) here but recasting memory
498  // to avoid calling constructor/destructor in shared memory.
499  RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>* sd =
500  reinterpret_cast<RAJA::detail::SoAArray<
501  T, policy::cuda::device_constants.MAX_WARPS>*>(tmpsd);
502 
503  // write per warp values to shared memory
504  if (warpId == 0)
505  {
506  sd->set(warpNum, temp);
507  }
508 
509  __syncthreads();
510 
511  if (warpNum == 0)
512  {
513 
514  // read per warp values
515  if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads)
516  {
517  temp = sd->get(warpId);
518  }
519  else
520  {
521  temp = identity;
522  }
523 
524  for (int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2)
525  {
526  T rhs = shfl_xor_sync(temp, i);
527  Combiner {}(temp, rhs);
528  }
529  }
530 
531  __syncthreads();
532  }
533 
534  return temp;
535 }
536 
537 } // end namespace impl
538 
539 } // end namespace cuda
540 
541 } // end namespace RAJA
542 
543 #endif // closing endif for RAJA_ENABLE_CUDA guard
544 
545 #endif // closing endif for header file include guard
Header file for common RAJA internal definitions.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Header file for common RAJA internal macro definitions.
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DEVICE
Definition: macros.hpp:66
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
std::ptrdiff_t Index_type
Definition: types.hpp:226
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
Abstracts T into an equal or greater size array of integers whose size is between min_integer_type_si...
Definition: types.hpp:962
integer_type array[num_integer_type]
Definition: types.hpp:1000
RAJA_HOST_DEVICE void set_value(T value)
Definition: types.hpp:1016
Abstracts access to memory using normal memory accesses.
Definition: types.hpp:938
Header file for RAJA type definitions.