RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
forall.hpp
Go to the documentation of this file.
1 
15 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
16 // Copyright (c) Lawrence Livermore National Security, LLC and other
17 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
18 // files for dates and other details. No copyright assignment is required
19 // to contribute to RAJA.
20 //
21 // SPDX-License-Identifier: (BSD-3-Clause)
22 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
23 
24 #ifndef RAJA_forall_hip_HPP
25 #define RAJA_forall_hip_HPP
26 
27 #include "RAJA/config.hpp"
28 
29 #if defined(RAJA_ENABLE_HIP)
30 
31 #include <algorithm>
32 #include "hip/hip_runtime.h"
33 
34 #include "RAJA/pattern/forall.hpp"
35 
37 
38 #include "RAJA/util/macros.hpp"
39 #include "RAJA/util/types.hpp"
40 
44 
45 #include "RAJA/index/IndexSet.hpp"
46 
47 #include "RAJA/util/resource.hpp"
48 
49 namespace RAJA
50 {
51 namespace policy
52 {
53 namespace hip
54 {
55 
56 namespace impl
57 {
58 
74 template<typename IterationMapping,
75  typename IterationGetter,
76  typename Concretizer,
77  typename UniqueMarker>
78 struct ForallDimensionCalculator;
79 
80 // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0
81 // there are specializations for named_usage::unspecified
82 // but named_usage::ignored is not supported so no specializations are provided
83 // and static_asserts in the general case catch unsupported values
84 template<named_dim dim,
85  int BLOCK_SIZE,
86  int GRID_SIZE,
87  typename Concretizer,
88  typename UniqueMarker>
89 struct ForallDimensionCalculator<
91  ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
92  Concretizer,
93  UniqueMarker>
94 {
95  static_assert(
96  BLOCK_SIZE > 0,
97  "block size must be > 0 or named_usage::unspecified with forall");
98  static_assert(
99  GRID_SIZE > 0,
100  "grid size must be > 0 or named_usage::unspecified with forall");
101 
102  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
103 
104  template<typename IdxT>
105  static void set_dimensions(internal::HipDims& dims,
106  IdxT len,
107  const void* RAJA_UNUSED_ARG(func),
108  size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
109  {
110  const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
111  const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
112 
113  if (len > (block_size * grid_size))
114  {
116  "len exceeds the size of the directly mapped index space");
117  }
118 
119  internal::set_hip_dim<dim>(dims.threads,
120  static_cast<IdxT>(IndexGetter::block_size));
121  internal::set_hip_dim<dim>(dims.blocks,
122  static_cast<IdxT>(IndexGetter::grid_size));
123  }
124 };
125 
126 template<named_dim dim,
127  int GRID_SIZE,
128  typename Concretizer,
129  typename UniqueMarker>
130 struct ForallDimensionCalculator<
132  ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
133  Concretizer,
134  UniqueMarker>
135 {
136  static_assert(
137  GRID_SIZE > 0,
138  "grid size must be > 0 or named_usage::unspecified with forall");
139 
140  using IndexGetter =
141  ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
142 
143  template<typename IdxT>
144  static void set_dimensions(internal::HipDims& dims,
145  IdxT len,
146  const void* func,
147  size_t dynamic_shmem_size)
148  {
149  ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
150  func, dynamic_shmem_size, len};
151 
152  const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
153  const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
154 
155  if (block_size == IdxT(0))
156  {
158  "len exceeds the size of the directly mapped index space");
159  }
160 
161  internal::set_hip_dim<dim>(dims.threads, block_size);
162  internal::set_hip_dim<dim>(dims.blocks, grid_size);
163  }
164 };
165 
166 template<named_dim dim,
167  int BLOCK_SIZE,
168  typename Concretizer,
169  typename UniqueMarker>
170 struct ForallDimensionCalculator<
172  ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
173  Concretizer,
174  UniqueMarker>
175 {
176  static_assert(
177  BLOCK_SIZE > 0,
178  "block size must be > 0 or named_usage::unspecified with forall");
179 
180  using IndexGetter =
181  ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
182 
183  template<typename IdxT>
184  static void set_dimensions(internal::HipDims& dims,
185  IdxT len,
186  const void* func,
187  size_t dynamic_shmem_size)
188  {
189  ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
190  func, dynamic_shmem_size, len};
191 
192  const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
193  const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
194 
195  internal::set_hip_dim<dim>(dims.threads, block_size);
196  internal::set_hip_dim<dim>(dims.blocks, grid_size);
197  }
198 };
199 
200 template<named_dim dim, typename Concretizer, typename UniqueMarker>
201 struct ForallDimensionCalculator<
203  ::RAJA::hip::
204  IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
205  Concretizer,
206  UniqueMarker>
207 {
208  using IndexGetter = ::RAJA::hip::
209  IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
210 
211  template<typename IdxT>
212  static void set_dimensions(internal::HipDims& dims,
213  IdxT len,
214  const void* func,
215  size_t dynamic_shmem_size)
216  {
217  ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
218  func, dynamic_shmem_size, len};
219 
220  const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
221 
222  internal::set_hip_dim<dim>(dims.threads, sizes.first);
223  internal::set_hip_dim<dim>(dims.blocks, sizes.second);
224  }
225 };
226 
227 template<named_dim dim,
228  int BLOCK_SIZE,
229  int GRID_SIZE,
230  typename Concretizer,
231  typename UniqueMarker>
232 struct ForallDimensionCalculator<
233  ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
234  ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
235  Concretizer,
236  UniqueMarker>
237 {
238  static_assert(
239  BLOCK_SIZE > 0,
240  "block size must be > 0 or named_usage::unspecified with forall");
241  static_assert(
242  GRID_SIZE > 0,
243  "grid size must be > 0 or named_usage::unspecified with forall");
244 
245  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
246 
247  template<typename IdxT>
248  static void set_dimensions(internal::HipDims& dims,
249  IdxT RAJA_UNUSED_ARG(len),
250  const void* RAJA_UNUSED_ARG(func),
251  size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
252  {
253  const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
254  const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
255 
256  internal::set_hip_dim<dim>(dims.threads, block_size);
257  internal::set_hip_dim<dim>(dims.blocks, grid_size);
258  }
259 };
260 
261 template<named_dim dim,
262  int GRID_SIZE,
263  typename Concretizer,
264  typename UniqueMarker>
265 struct ForallDimensionCalculator<
266  ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
267  ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
268  Concretizer,
269  UniqueMarker>
270 {
271  static_assert(
272  GRID_SIZE > 0,
273  "grid size must be > 0 or named_usage::unspecified with forall");
274 
275  using IndexGetter =
276  ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
277 
278  template<typename IdxT>
279  static void set_dimensions(internal::HipDims& dims,
280  IdxT len,
281  const void* func,
282  size_t dynamic_shmem_size)
283  {
284  ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
285  func, dynamic_shmem_size, len};
286 
287  const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
288  const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
289 
290  internal::set_hip_dim<dim>(dims.threads, block_size);
291  internal::set_hip_dim<dim>(dims.blocks, grid_size);
292  }
293 };
294 
295 template<named_dim dim,
296  int BLOCK_SIZE,
297  typename Concretizer,
298  typename UniqueMarker>
299 struct ForallDimensionCalculator<
300  ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
301  ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
302  Concretizer,
303  UniqueMarker>
304 {
305  static_assert(
306  BLOCK_SIZE > 0,
307  "block size must be > 0 or named_usage::unspecified with forall");
308 
309  using IndexGetter =
310  ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
311 
312  template<typename IdxT>
313  static void set_dimensions(internal::HipDims& dims,
314  IdxT len,
315  const void* func,
316  size_t dynamic_shmem_size)
317  {
318  ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
319  func, dynamic_shmem_size, len};
320 
321  const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
322  const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
323 
324  internal::set_hip_dim<dim>(dims.threads, block_size);
325  internal::set_hip_dim<dim>(dims.blocks, grid_size);
326  }
327 };
328 
329 template<named_dim dim, typename Concretizer, typename UniqueMarker>
330 struct ForallDimensionCalculator<
331  ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
332  ::RAJA::hip::
333  IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
334  Concretizer,
335  UniqueMarker>
336 {
337  using IndexGetter = ::RAJA::hip::
338  IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
339 
340  template<typename IdxT>
341  static void set_dimensions(internal::HipDims& dims,
342  IdxT len,
343  const void* func,
344  size_t dynamic_shmem_size)
345  {
346  ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
347  func, dynamic_shmem_size, len};
348 
349  const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
350 
351  internal::set_hip_dim<dim>(dims.threads, sizes.first);
352  internal::set_hip_dim<dim>(dims.blocks, sizes.second);
353  }
354 };
355 
356 //
358 //
359 // HIP kernel templates.
360 //
362 //
363 
364 
365 template<typename EXEC_POL,
366  typename Iterator,
367  typename LOOP_BODY,
368  typename IndexType,
369  typename ForallParam,
370  typename IterationMapping = typename EXEC_POL::IterationMapping,
371  typename IterationGetter = typename EXEC_POL::IterationGetter,
372  std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
373  IterationMapping>::value &&
374  (IterationGetter::block_size > 0),
375  size_t> BlockSize = IterationGetter::block_size>
376 __launch_bounds__(BlockSize, 1) __global__
377  void forallp_hip_kernel(const LOOP_BODY loop_body,
378  const Iterator idx,
379  const IndexType length,
380  ForallParam f_params)
381 {
383  auto privatizer = thread_privatize(loop_body);
384  auto& body = privatizer.get_priv();
385  auto ii = IterationGetter::template index<IndexType>();
386 
387  if (ii < length)
388  {
389  RAJA::expt::invoke_body(f_params, body, idx[ii]);
390  }
391 
393 }
394 
395 template<typename EXEC_POL,
396  typename Iterator,
397  typename LOOP_BODY,
398  typename IndexType,
399  typename ForallParam,
400  typename IterationMapping = typename EXEC_POL::IterationMapping,
401  typename IterationGetter = typename EXEC_POL::IterationGetter,
402  std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
403  IterationMapping>::value &&
404  (IterationGetter::block_size <= 0),
405  size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
406 __global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
407  const Iterator idx,
408  const IndexType length,
409  ForallParam f_params)
410 {
412  auto privatizer = thread_privatize(loop_body);
413  auto& body = privatizer.get_priv();
414  auto ii = IterationGetter::template index<IndexType>();
415  ;
416  if (ii < length)
417  {
418  RAJA::expt::invoke_body(f_params, body, idx[ii]);
419  }
421 }
422 
423 template<
424  typename EXEC_POL,
425  typename Iterator,
426  typename LOOP_BODY,
427  typename IndexType,
428  typename ForallParam,
429  typename IterationMapping = typename EXEC_POL::IterationMapping,
430  typename IterationGetter = typename EXEC_POL::IterationGetter,
431  std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
432  IterationMapping>::value &&
433  std::is_base_of<iteration_mapping::UnsizedLoopBase,
434  IterationMapping>::value &&
435  (IterationGetter::block_size > 0),
436  size_t> BlockSize = IterationGetter::block_size>
437 __launch_bounds__(BlockSize, 1) __global__
438  void forallp_hip_kernel(const LOOP_BODY loop_body,
439  const Iterator idx,
440  const IndexType length,
441  ForallParam f_params)
442 {
444  auto privatizer = thread_privatize(loop_body);
445  auto& body = privatizer.get_priv();
446 
447  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
448  ii += IterationGetter::template size<IndexType>())
449  {
450  RAJA::expt::invoke_body(f_params, body, idx[ii]);
451  }
453 }
454 
455 template<
456  typename EXEC_POL,
457  typename Iterator,
458  typename LOOP_BODY,
459  typename IndexType,
460  typename ForallParam,
461  typename IterationMapping = typename EXEC_POL::IterationMapping,
462  typename IterationGetter = typename EXEC_POL::IterationGetter,
463  std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
464  IterationMapping>::value &&
465  std::is_base_of<iteration_mapping::UnsizedLoopBase,
466  IterationMapping>::value &&
467  (IterationGetter::block_size <= 0),
468  size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
469 __global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
470  const Iterator idx,
471  const IndexType length,
472  ForallParam f_params)
473 {
475  auto privatizer = thread_privatize(loop_body);
476  auto& body = privatizer.get_priv();
477 
478  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
479  ii += IterationGetter::template size<IndexType>())
480  {
481 
482  RAJA::expt::invoke_body(f_params, body, idx[ii]);
483  }
484 
486 }
487 
488 } // namespace impl
489 
490 //
492 //
493 // Function templates for HIP execution over iterables.
494 //
496 //
497 
498 
499 template<typename Iterable,
500  typename LoopBody,
501  typename IterationMapping,
502  typename IterationGetter,
503  typename Concretizer,
504  bool Async,
505  typename ForallParam>
506 RAJA_INLINE concepts::enable_if_t<
507  resources::EventProxy<resources::Hip>,
509 forall_impl(resources::Hip hip_res,
510  ::RAJA::policy::hip::hip_exec<IterationMapping,
511  IterationGetter,
512  Concretizer,
513  Async> const& pol,
514  Iterable&& iter,
515  LoopBody&& loop_body,
516  ForallParam f_params)
517 {
518  using Iterator = camp::decay<decltype(std::begin(iter))>;
519  using LOOP_BODY = camp::decay<LoopBody>;
520  using IndexType =
521  camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
522  using EXEC_POL = camp::decay<decltype(pol)>;
523  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter,
524  LOOP_BODY, Iterator, ForallParam>;
525  using DimensionCalculator =
526  impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
527  Concretizer, UniqueMarker>;
528 
529  //
530  // Compute the requested iteration space size
531  //
532  Iterator begin = std::begin(iter);
533  Iterator end = std::end(iter);
534  IndexType len = std::distance(begin, end);
535 
536  // Only launch kernel if we have something to iterate over
537  if (len > 0)
538  {
539 
540  auto func = reinterpret_cast<const void*>(
541  &impl::forallp_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
542  camp::decay<ForallParam>>);
543 
544  //
545  // Setup shared memory buffers
546  //
547  size_t shmem = 0;
548 
549  //
550  // Compute the kernel dimensions
551  //
552  internal::HipDims dims(1);
553  DimensionCalculator::set_dimensions(dims, len, func, shmem);
554 
555 
556  RAJA::hip::detail::hipInfo launch_info;
557  launch_info.gridDim = dims.blocks;
558  launch_info.blockDim = dims.threads;
559  launch_info.res = hip_res;
560 
561  {
562  RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params, launch_info);
563 
564  //
565  // Privatize the loop_body, using make_launch_body to setup reductions
566  //
567  LOOP_BODY body = RAJA::hip::make_launch_body(
568  func, dims.blocks, dims.threads, shmem, hip_res,
569  std::forward<LoopBody>(loop_body));
570 
571  //
572  // Launch the kernels
573  //
574  void* args[] = {(void*)&body, (void*)&begin, (void*)&len,
575  (void*)&f_params};
576  RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res,
577  Async);
578 
580  launch_info);
581  }
582  }
583 
584  return resources::EventProxy<resources::Hip>(hip_res);
585 }
586 
587 //
589 //
590 // The following function templates iterate over index set segments
591 // using the explicitly named segment iteration policy and execute
592 // segments as HIP kernels.
593 //
595 //
596 
605 template<typename LoopBody,
606  typename IterationMapping,
607  typename IterationGetter,
608  typename Concretizer,
609  bool Async,
610  typename... SegmentTypes>
611 RAJA_INLINE resources::EventProxy<resources::Hip> forall_impl(
612  resources::Hip r,
613  ExecPolicy<
614  seq_segit,
615  ::RAJA::policy::hip::
616  hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>,
617  const TypedIndexSet<SegmentTypes...>& iset,
618  LoopBody&& loop_body)
619 {
620  int num_seg = iset.getNumSegments();
621  for (int isi = 0; isi < num_seg; ++isi)
622  {
623  iset.segmentCall(
624  r, isi, detail::CallForall(),
625  ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter,
626  Concretizer, true>(),
627  loop_body);
628  } // iterate over segments of index set
629 
630  if (!Async) RAJA::hip::synchronize(r);
631  return resources::EventProxy<resources::Hip>(r);
632 }
633 
634 } // namespace hip
635 
636 } // namespace policy
637 
638 } // namespace RAJA
639 
640 #endif // closing endif for RAJA_ENABLE_HIP guard
641 
642 #endif // closing endif for header file include guard
RAJA header file defining index set classes.
Header file defining prototypes for routines used to manage memory for HIP reductions and other opera...
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
constexpr RAJA_HOST_DEVICE auto invoke_body(Params &&params, Fn &&f, Ts &&... extra)
Definition: forall.hpp:598
RAJA_HOST_DEVICE auto thread_privatize(const T &item) -> Privatizer< T >
Create a private copy of the argument to be stored on the current thread's stack in a class of the Pr...
Definition: privatizer.hpp:88
seq_exec seq_segit
Definition: policy.hpp:83
RAJA_INLINE concepts::enable_if_t< resources::EventProxy< resources::Host >, expt::type_traits::is_ForallParamPack< ForallParam >, expt::type_traits::is_ForallParamPack_empty< ForallParam > > forall_impl(resources::Host host_res, const simd_exec &, Iterable &&iter, Func &&body, ForallParam)
Definition: forall.hpp:81
Definition: AlignedRangeIndexSetBuilders.cpp:35
named_dim
Definition: types.hpp:53
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
auto & body
Definition: launch.hpp:177
auto privatizer
Definition: launch.hpp:176
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
Header file containing RAJA index set and segment iteration template methods that take an execution p...
Header file containing utility methods used in HIP operations.
Header file for RAJA resource definitions.
static constexpr void parampack_resolve(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:304
static constexpr void parampack_init(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:269
static RAJA_HOST_DEVICE constexpr void parampack_combine(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:286
Definition: TypeTraits.hpp:59
Definition: types.hpp:143
Definition: types.hpp:209
Header file for RAJA type definitions.