RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
HipKernel.hpp
Go to the documentation of this file.
1 
12 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
13 // Copyright (c) Lawrence Livermore National Security, LLC and other
14 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
15 // files for dates and other details. No copyright assignment is required
16 // to contribute to RAJA.
17 //
18 // SPDX-License-Identifier: (BSD-3-Clause)
19 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
20 
21 #ifndef RAJA_policy_hip_kernel_HipKernel_HPP
22 #define RAJA_policy_hip_kernel_HipKernel_HPP
23 
24 #include "RAJA/config.hpp"
25 
26 #if defined(RAJA_ENABLE_HIP)
27 
28 #include <cassert>
29 #include <climits>
30 
31 #include "camp/camp.hpp"
32 
33 #include "RAJA/util/macros.hpp"
34 #include "RAJA/util/types.hpp"
35 
36 #include "RAJA/pattern/kernel.hpp"
39 
41 
44 
46 
47 namespace RAJA
48 {
49 
57 template<bool async0, int num_blocks, int num_threads>
58 struct hip_explicit_launch
59 {};
60 
74 template<bool async0, int num_blocks, int num_threads>
75 using hip_launch = hip_explicit_launch<async0, num_blocks, num_threads>;
76 
82 template<int num_threads0, bool async0>
83 using hip_occ_calc_launch = hip_explicit_launch<async0, 0, num_threads0>;
84 
85 namespace statement
86 {
87 
93 template<typename LaunchConfig, typename... EnclosedStmts>
94 struct HipKernelExt
95  : public internal::Statement<
96  ::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>,
97  EnclosedStmts...>
98 {};
99 
106 template<int num_blocks, int num_threads, typename... EnclosedStmts>
107 using HipKernelExp =
108  HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>,
109  EnclosedStmts...>;
110 
117 template<int num_blocks, int num_threads, typename... EnclosedStmts>
118 using HipKernelExpAsync =
119  HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>,
120  EnclosedStmts...>;
121 
127 template<typename... EnclosedStmts>
128 using HipKernelOcc =
129  HipKernelExt<hip_occ_calc_launch<1024, false>, EnclosedStmts...>;
130 
136 template<typename... EnclosedStmts>
137 using HipKernelOccAsync =
138  HipKernelExt<hip_occ_calc_launch<1024, true>, EnclosedStmts...>;
139 
145 template<int num_threads, typename... EnclosedStmts>
146 using HipKernelFixed = HipKernelExt<
148  EnclosedStmts...>;
149 
155 template<int num_threads, typename... EnclosedStmts>
156 using HipKernelFixedAsync = HipKernelExt<
158  EnclosedStmts...>;
159 
164 template<typename... EnclosedStmts>
165 using HipKernel = HipKernelFixed<1024, EnclosedStmts...>;
166 
171 template<typename... EnclosedStmts>
172 using HipKernelAsync = HipKernelFixedAsync<1024, EnclosedStmts...>;
173 
174 } // namespace statement
175 
176 namespace internal
177 {
178 
179 
183 template<typename Data, typename Exec>
184 __global__ void HipKernelLauncher(const Data data)
185 {
186 
187  using data_t = camp::decay<Data>;
188  data_t private_data = data;
189 
190  Exec::exec(private_data, true);
191  RAJA::expt::detail::combine_params<RAJA::hip_flatten_global_xyz_direct>(
192  private_data.param_tuple);
193 }
194 
202 template<int BlockSize, typename Data, typename Exec>
203 __launch_bounds__(BlockSize, 1) __global__
204  void HipKernelLauncherFixed(const Data data)
205 {
206 
207  using data_t = camp::decay<Data>;
208  data_t private_data = data;
209 
210  // execute the the object
211  Exec::exec(private_data, true);
212 
213  RAJA::expt::detail::combine_params<RAJA::hip_flatten_global_xyz_direct>(
214  private_data.param_tuple);
215 }
216 
225 template<int BlockSize, typename Data, typename executor_t>
226 struct HipKernelLauncherGetter
227 {
228  using type = camp::decay<
229  decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
230 
231  static constexpr type get() noexcept
232  {
233  return &internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>;
234  }
235 };
236 
241 template<typename Data, typename executor_t>
242 struct HipKernelLauncherGetter<0, Data, executor_t>
243 {
244  using type =
245  camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
246 
247  static constexpr type get() noexcept
248  {
249  return &internal::HipKernelLauncher<Data, executor_t>;
250  }
251 };
252 
253 
258 template<typename LaunchPolicy,
259  typename StmtList,
260  typename Data,
261  typename Types>
262 struct HipLaunchHelper;
263 
269 template<bool async0,
270  int num_blocks,
271  int num_threads,
272  typename StmtList,
273  typename Data,
274  typename Types>
275 struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,
276  StmtList,
277  Data,
278  Types>
279 {
280  using Self = HipLaunchHelper;
281 
282  static constexpr bool async = async0;
283 
284  using executor_t =
285  internal::hip_statement_list_executor_t<StmtList, Data, Types>;
286 
287  using kernelGetter_t =
288  HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
289  Data,
290  executor_t>;
291 
292  inline static const void* get_func()
293  {
294  return reinterpret_cast<const void*>(kernelGetter_t::get());
295  }
296 
297  inline static void recommended_blocks_threads(size_t shmem_size,
298  int& recommended_blocks,
299  int& recommended_threads)
300  {
301  auto func = Self::get_func();
302 
303  if (num_blocks <= 0)
304  {
305 
306  if (num_threads <= 0)
307  {
308 
309  //
310  // determine blocks at runtime
311  // determine threads at runtime
312  //
313  auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads<Self>(
314  func, shmem_size);
315  recommended_blocks = data.func_max_blocks_per_device;
316  recommended_threads = data.func_max_threads_per_block;
317  }
318  else
319  {
320 
321  //
322  // determine blocks at runtime
323  // threads determined at compile-time
324  //
325  recommended_threads = num_threads;
326 
327  auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
328  func, shmem_size);
329  recommended_blocks =
330  data.func_max_blocks_per_sm * data.device_sm_per_device;
331  }
332  }
333  else
334  {
335 
336  if (num_threads <= 0)
337  {
338 
339  //
340  // determine threads at runtime, unsure what use 1024
341  // this value may be invalid for kernels with high register pressure
342  //
343  recommended_threads = 1024;
344  }
345  else
346  {
347 
348  //
349  // threads determined at compile-time
350  //
351  recommended_threads = num_threads;
352  }
353 
354  //
355  // blocks determined at compile-time
356  //
357  recommended_blocks = num_blocks;
358  }
359  }
360 
361  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
362  int& max_threads)
363  {
364  if (num_threads <= 0)
365  {
366 
367  //
368  // determine threads at runtime, unsure what use 1024
369  // this value may be invalid for kernels with high register pressure
370  //
371  max_threads = 1024;
372  }
373  else
374  {
375 
376  //
377  // threads determined at compile-time
378  //
379  max_threads = num_threads;
380  }
381  }
382 
383  inline static void max_blocks(size_t shmem_size,
384  int& max_blocks,
385  int actual_threads)
386  {
387  auto func = Self::get_func();
388 
389  if (num_blocks <= 0)
390  {
391 
392  //
393  // determine blocks at runtime
394  //
395  if (num_threads <= 0 || num_threads != actual_threads)
396  {
397 
398  //
399  // determine blocks when actual_threads != num_threads
400  //
401  auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self>(
402  func, shmem_size, actual_threads);
403  max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
404  }
405  else
406  {
407 
408  //
409  // determine blocks when actual_threads == num_threads
410  //
411  auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
412  func, shmem_size);
413  max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
414  }
415  }
416  else
417  {
418 
419  //
420  // blocks determined at compile-time
421  //
422  max_blocks = num_blocks;
423  }
424  }
425 };
426 
436 inline hip_dim_t fitHipDims(hip_dim_member_t limit,
437  hip_dim_t result,
438  hip_dim_t minimum = hip_dim_t())
439 {
440 
441 
442  // clamp things to at least 1
443  result.x = result.x ? result.x : 1;
444  result.y = result.y ? result.y : 1;
445  result.z = result.z ? result.z : 1;
446 
447  minimum.x = minimum.x ? minimum.x : 1;
448  minimum.y = minimum.y ? minimum.y : 1;
449  minimum.z = minimum.z ? minimum.z : 1;
450 
451  // if we are under the limit, we're done
452  if (result.x * result.y * result.z <= limit) return result;
453 
454  // Can we reduce z to fit?
455  if (result.x * result.y * minimum.z < limit)
456  {
457  // compute a new z
458  result.z = limit / (result.x * result.y);
459  return result;
460  }
461  // we don't fit, so reduce z to it's minimum and continue on to y
462  result.z = minimum.z;
463 
464 
465  // Can we reduce y to fit?
466  if (result.x * minimum.y * result.z < limit)
467  {
468  // compute a new y
469  result.y = limit / (result.x * result.z);
470  return result;
471  }
472  // we don't fit, so reduce y to it's minimum and continue on to x
473  result.y = minimum.y;
474 
475 
476  // Can we reduce y to fit?
477  if (minimum.x * result.y * result.z < limit)
478  {
479  // compute a new x
480  result.x = limit / (result.y * result.z);
481  return result;
482  }
483  // we don't fit, so we'll return the smallest possible thing
484  result.x = minimum.x;
485 
486  return result;
487 }
488 
492 template<typename LaunchConfig, typename... EnclosedStmts, typename Types>
493 struct StatementExecutor<
494  statement::HipKernelExt<LaunchConfig, EnclosedStmts...>,
495  Types>
496 {
497 
498  using stmt_list_t = StatementList<EnclosedStmts...>;
499  using StatementType = statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
500 
501  template<typename Data>
502  static inline void exec(Data&& data)
503  {
504 
505  using data_t = camp::decay<Data>;
506  using executor_t =
507  hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
508  using launch_t = HipLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
509 
510 
511  RAJA::resources::Hip res = data.get_resource();
512 
513 
514  //
515  // Compute the requested kernel dimensions
516  //
517  LaunchDims launch_dims = executor_t::calculateDimensions(data);
518 
519 
520  // Only launch kernel if we have something to iterate over
521  bool active_threads = launch_dims.threads_are_active();
522  bool active_blocks = launch_dims.blocks_are_active();
523  int num_blocks = launch_dims.num_blocks();
524  int num_threads = launch_dims.num_threads();
525  if ((active_threads || active_blocks) &&
526  (!active_blocks || num_blocks > 0) &&
527  (!active_threads || num_threads > 0))
528  {
529 
530  //
531  // Setup shared memory buffers
532  //
533  size_t shmem = 0;
534 
535 
536  //
537  // Compute the recommended physical kernel blocks and threads
538  //
539  int recommended_blocks;
540  int recommended_threads;
541  launch_t::recommended_blocks_threads(shmem, recommended_blocks,
542  recommended_threads);
543 
544 
545  //
546  // Compute the MAX physical kernel threads
547  //
548  int max_threads;
549  launch_t::max_threads(shmem, max_threads);
550 
551 
552  //
553  // Fit the requested threads
554  //
555  hip_dim_t fit_threads {0, 0, 0};
556 
557  if (recommended_threads >= get_size(launch_dims.min_dims.threads))
558  {
559 
560  fit_threads = fitHipDims(recommended_threads, launch_dims.dims.threads,
561  launch_dims.min_dims.threads);
562  }
563 
564  //
565  // Redo fit with max threads
566  //
567  if (recommended_threads < max_threads &&
568  get_size(fit_threads) != recommended_threads)
569  {
570 
571  fit_threads = fitHipDims(max_threads, launch_dims.dims.threads,
572  launch_dims.min_dims.threads);
573  }
574 
575  launch_dims.dims.threads = fit_threads;
576 
577 
578  //
579  // Compute the MAX physical kernel blocks
580  //
581  int max_blocks;
582  launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads());
583 
584  int use_blocks;
585 
586  if (launch_dims.num_threads() == recommended_threads)
587  {
588 
589  //
590  // Fit the requested blocks
591  //
592  use_blocks = recommended_blocks;
593  }
594  else
595  {
596 
597  //
598  // Fit the max blocks
599  //
600  use_blocks = max_blocks;
601  }
602 
603  launch_dims.dims.blocks = fitHipDims(use_blocks, launch_dims.dims.blocks,
604  launch_dims.min_dims.blocks);
605 
606  //
607  // make sure that we fit
608  //
609  /* Doesn't make sense to check this anymore - AJK
610  if(launch_dims.num_blocks() > max_blocks){
611  RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
612  }*/
613  if (launch_dims.num_threads() > max_threads)
614  {
615  RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
616  }
617 
618  {
619  auto func = launch_t::get_func();
620  // The exact policy here does not affect the reduction operation, but
621  // we do need to accurately pass a resource and launch dimensions to
622  // perform initialization and resolution of reduction parameters.
623  using EXEC_POL =
624  ::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>;
625 
626  RAJA::hip::detail::hipInfo launch_info;
627  launch_info.gridDim = launch_dims.dims.blocks;
628  launch_info.blockDim = launch_dims.dims.threads;
629  launch_info.dynamic_smem = &shmem;
630  launch_info.res = res;
631 
632  RAJA::expt::detail::init_params<EXEC_POL>(data.param_tuple,
633  launch_info);
634  //
635  // Privatize the LoopData, using make_launch_body to setup reductions
636  //
637  // Note that there is a circular dependency between the previous setup
638  // of the launch_dims and potential changes to shmem here that is
639  // currently an unresolved issue.
640  //
641  auto hip_data = RAJA::hip::make_launch_body(
642  func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
643  data);
644 
645  //
646  // Launch the kernel
647  //
648  void* args[] = {(void*)&hip_data};
649  RAJA::hip::launch(func, launch_dims.dims.blocks,
650  launch_dims.dims.threads, args, shmem, res,
651  launch_t::async);
652  RAJA::expt::detail::resolve_params<EXEC_POL>(data.param_tuple,
653  launch_info);
654  }
655  }
656  }
657 };
658 
659 
660 } // namespace internal
661 } // namespace RAJA
662 
663 #endif // closing endif for RAJA_ENABLE_HIP guard
664 
665 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for HIP reductions and other opera...
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Args args
Definition: WorkRunner.hpp:212
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file for statement wrappers and executors.
Header file for kernel lambda executor.
RAJA header file containing user interface for RAJA::kernel.
RAJA header file containing constructs used to run kernel traversals on GPU with HIP.
Header file for RAJA type definitions.