RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
intrinsics.hpp
Go to the documentation of this file.
1 
14 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
15 // Copyright (c) Lawrence Livermore National Security, LLC and other
16 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
17 // files for dates and other details. No copyright assignment is required
18 // to contribute to RAJA.
19 //
20 // SPDX-License-Identifier: (BSD-3-Clause)
21 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
22 
23 #ifndef RAJA_hip_intrinsics_HPP
24 #define RAJA_hip_intrinsics_HPP
25 
26 #include "RAJA/config.hpp"
27 
28 #if defined(RAJA_HIP_ACTIVE)
29 
30 #include <type_traits>
31 
32 #include <hip/hip_runtime.h>
33 
34 #include "RAJA/util/macros.hpp"
35 #include "RAJA/util/SoAArray.hpp"
36 #include "RAJA/util/types.hpp"
37 
38 namespace RAJA
39 {
40 
41 namespace policy
42 {
43 namespace hip
44 {
45 
46 struct DeviceConstants
47 {
48  RAJA::Index_type WARP_SIZE;
49  RAJA::Index_type MAX_BLOCK_SIZE;
50  RAJA::Index_type MAX_WARPS;
52  ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of
53  // the cache level that handles
54  // atomics
55 
56  constexpr DeviceConstants(RAJA::Index_type warp_size,
57  RAJA::Index_type max_block_size,
58  RAJA::Index_type atomic_cache_line_bytes) noexcept
59  : WARP_SIZE(warp_size),
60  MAX_BLOCK_SIZE(max_block_size),
61  MAX_WARPS(max_block_size / warp_size),
62  ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
63  {}
64 };
65 
66 //
67 // Operations in the included files are parametrized using the following
68 // values for HIP warp size and max block size.
69 //
70 #if defined(__HIP_PLATFORM_AMD__)
71 constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE,
72  1024,
73  64); // MI300A
74 // constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 128); //
75 // MI250X
76 
77 #elif defined(__HIP_PLATFORM_NVIDIA__)
78 constexpr DeviceConstants device_constants(RAJA_CUDA_WARPSIZE,
79  1024,
80  32); // V100
81 #endif
82 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
83  "RAJA Assumption Broken: device_constants.WARP_SIZE < "
84  "device_constants.MAX_WARPS");
85 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
86  "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
87  "a multiple of device_constants.WARP_SIZE");
88 
89 } // end namespace hip
90 
91 } // end namespace policy
92 
93 namespace hip
94 {
95 
96 namespace impl
97 {
98 
111 struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
112 {
113  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
114 
115  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
116 };
117 
138 struct AccessorDeviceScopeUseBlockFence
139 {
140  // hip has 32 and 64 bit atomics
141  static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
142  static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
143 
144  template<typename T>
145  static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
146  {
147  using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
148  max_atomic_int_type_size>;
149  using integer_type = typename ArrayType::integer_type;
150 
151  ArrayType u;
152  auto ptr = const_cast<integer_type*>(
153  reinterpret_cast<const integer_type*>(in_ptr + idx));
154 
155  for (size_t i = 0; i < u.array_size(); ++i)
156  {
157 #if defined(RAJA_USE_HIP_INTRINSICS) && \
158  RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
159  u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED,
160  __HIP_MEMORY_SCOPE_AGENT);
161 #else
162  u.array[i] = ::atomicAdd(&ptr[i], integer_type(0));
163 #endif
164  }
165 
166  return u.get_value();
167  }
168 
169  template<typename T>
170  static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
171  {
172  using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
173  max_atomic_int_type_size>;
174  using integer_type = typename ArrayType::integer_type;
175 
176  ArrayType u;
177  u.set_value(val);
178  auto ptr = reinterpret_cast<integer_type*>(in_ptr + idx);
179 
180  for (size_t i = 0; i < u.array_size(); ++i)
181  {
182 #if defined(RAJA_USE_HIP_INTRINSICS) && \
183  RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
184  __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED,
185  __HIP_MEMORY_SCOPE_AGENT);
186 #else
187  ::atomicExch(&ptr[i], u.array[i]);
188 #endif
189  }
190  }
191 
192  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
193  {
194 #if defined(RAJA_USE_HIP_INTRINSICS) && \
195  RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
196  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
197 #else
198  __threadfence();
199 #endif
200  }
201 
202  static RAJA_DEVICE RAJA_INLINE void fence_release()
203  {
204 #if defined(RAJA_USE_HIP_INTRINSICS) && \
205  RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \
206  RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
207  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
208  // Wait until all vmem operations complete (s_waitcnt vmcnt(0))
209  __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) |
210  (/*lgkmcnt*/ 0xf << 8));
211 #else
212  __threadfence();
213 #endif
214  }
215 };
216 
217 // hip only has shfl primitives for 32 bits
218 constexpr size_t min_shfl_int_type_size = sizeof(unsigned int);
219 constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
220 
231 template<typename T>
232 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
233 {
234  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
235  max_shfl_int_type_size>
236  u;
237  u.set_value(var);
238 
239  for (size_t i = 0; i < u.array_size(); ++i)
240  {
241  u.array[i] = ::__shfl_xor(u.array[i], laneMask);
242  }
243  return u.get_value();
244 }
245 
246 template<typename T>
247 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
248 {
249  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
250  max_shfl_int_type_size>
251  u;
252  u.set_value(var);
253 
254  for (size_t i = 0; i < u.array_size(); ++i)
255  {
256  u.array[i] = ::__shfl(u.array[i], srcLane);
257  }
258  return u.get_value();
259 }
260 
261 template<>
262 RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
263 {
264  return ::__shfl_xor(var, laneMask);
265 }
266 
267 template<>
268 RAJA_DEVICE RAJA_INLINE float shfl_xor_sync<float>(float var, int laneMask)
269 {
270  return ::__shfl_xor(var, laneMask);
271 }
272 
273 template<>
274 RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
275 {
276  return ::__shfl(var, srcLane);
277 }
278 
279 template<>
280 RAJA_DEVICE RAJA_INLINE float shfl_sync<float>(float var, int srcLane)
281 {
282  return ::__shfl(var, srcLane);
283 }
284 
286 template<typename Combiner, typename T>
287 RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
288 {
289  int numThreads = blockDim.x * blockDim.y * blockDim.z;
290 
291  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
292  (blockDim.x * blockDim.y) * threadIdx.z;
293 
294  T temp = val;
295 
296  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
297  {
298 
299  // reduce each warp
300  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
301  {
302  T rhs = shfl_xor_sync(temp, i);
303  Combiner {}(temp, rhs);
304  }
305  }
306  else
307  {
308 
309  // reduce each warp
310  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
311  {
312  int srcLane = threadId ^ i;
313  T rhs = shfl_sync(temp, srcLane);
314  // only add from threads that exist (don't double count own value)
315  if (srcLane < numThreads)
316  {
317  Combiner {}(temp, rhs);
318  }
319  }
320  }
321 
322  return temp;
323 }
324 
332 template<typename Combiner, typename T>
333 RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
334 {
335  T temp = val;
336 
337  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
338  {
339  T rhs = shfl_xor_sync(temp, i);
340  Combiner {}(temp, rhs);
341  }
342 
343  return temp;
344 }
345 
347 template<typename Combiner, typename T>
348 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
349 {
350  int numThreads = blockDim.x * blockDim.y * blockDim.z;
351 
352  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
353  (blockDim.x * blockDim.y) * threadIdx.z;
354 
355  int warpId = threadId % policy::hip::device_constants.WARP_SIZE;
356  int warpNum = threadId / policy::hip::device_constants.WARP_SIZE;
357 
358  T temp = val;
359 
360  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
361  {
362 
363  // reduce each warp
364  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
365  {
366  T rhs = shfl_xor_sync(temp, i);
367  Combiner {}(temp, rhs);
368  }
369  }
370  else
371  {
372 
373  // reduce each warp
374  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
375  {
376  int srcLane = threadId ^ i;
377  T rhs = shfl_sync(temp, srcLane);
378  // only add from threads that exist (don't double count own value)
379  if (srcLane < numThreads)
380  {
381  Combiner {}(temp, rhs);
382  }
383  }
384  }
385 
386  // reduce per warp values
387  if (numThreads > policy::hip::device_constants.WARP_SIZE)
388  {
389 
390  static_assert(policy::hip::device_constants.MAX_WARPS <=
391  policy::hip::device_constants.WARP_SIZE,
392  "This algorithms assumes a warp of WARP_SIZE threads can "
393  "reduce MAX_WARPS values");
394 
395  __shared__ unsigned char tmpsd[sizeof(
396  RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>)];
397  RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>* sd =
398  reinterpret_cast<RAJA::detail::SoAArray<
399  T, policy::hip::device_constants.MAX_WARPS>*>(tmpsd);
400 
401  // write per warp values to shared memory
402  if (warpId == 0)
403  {
404  sd->set(warpNum, temp);
405  }
406 
407  __syncthreads();
408 
409  if (warpNum == 0)
410  {
411 
412  // read per warp values
413  if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads)
414  {
415  temp = sd->get(warpId);
416  }
417  else
418  {
419  temp = identity;
420  }
421 
422  for (int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2)
423  {
424  T rhs = shfl_xor_sync(temp, i);
425  Combiner {}(temp, rhs);
426  }
427  }
428 
429  __syncthreads();
430  }
431 
432  return temp;
433 }
434 
435 } // end namespace impl
436 
437 } // end namespace hip
438 
439 } // end namespace RAJA
440 
441 #endif // closing endif for RAJA_ENABLE_HIP guard
442 
443 #endif // closing endif for header file include guard
Header file for common RAJA internal definitions.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Header file for common RAJA internal macro definitions.
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DEVICE
Definition: macros.hpp:66
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
std::ptrdiff_t Index_type
Definition: types.hpp:226
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
Abstracts T into an equal or greater size array of integers whose size is between min_integer_type_si...
Definition: types.hpp:962
integer_type array[num_integer_type]
Definition: types.hpp:1000
RAJA_HOST_DEVICE void set_value(T value)
Definition: types.hpp:1016
Abstracts access to memory using normal memory accesses.
Definition: types.hpp:938
Header file for RAJA type definitions.