21 #ifndef RAJA_policy_hip_kernel_HipKernel_HPP
22 #define RAJA_policy_hip_kernel_HipKernel_HPP
24 #include "RAJA/config.hpp"
26 #if defined(RAJA_ENABLE_HIP)
31 #include "camp/camp.hpp"
57 template<
bool async0,
int num_blocks,
int num_threads>
58 struct hip_explicit_launch
74 template<
bool async0,
int num_blocks,
int num_threads>
75 using hip_launch = hip_explicit_launch<async0, num_blocks, num_threads>;
82 template<
int num_threads0,
bool async0>
83 using hip_occ_calc_launch = hip_explicit_launch<async0, 0, num_threads0>;
93 template<
typename LaunchConfig,
typename... EnclosedStmts>
95 :
public internal::Statement<
96 ::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>,
106 template<
int num_blocks,
int num_threads,
typename... EnclosedStmts>
108 HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>,
117 template<
int num_blocks,
int num_threads,
typename... EnclosedStmts>
118 using HipKernelExpAsync =
119 HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>,
127 template<
typename... EnclosedStmts>
129 HipKernelExt<hip_occ_calc_launch<1024, false>, EnclosedStmts...>;
136 template<
typename... EnclosedStmts>
137 using HipKernelOccAsync =
138 HipKernelExt<hip_occ_calc_launch<1024, true>, EnclosedStmts...>;
145 template<
int num_threads,
typename... EnclosedStmts>
146 using HipKernelFixed = HipKernelExt<
155 template<
int num_threads,
typename... EnclosedStmts>
156 using HipKernelFixedAsync = HipKernelExt<
164 template<
typename... EnclosedStmts>
165 using HipKernel = HipKernelFixed<1024, EnclosedStmts...>;
171 template<
typename... EnclosedStmts>
172 using HipKernelAsync = HipKernelFixedAsync<1024, EnclosedStmts...>;
183 template<
typename Data,
typename Exec>
184 __global__
void HipKernelLauncher(
const Data data)
187 using data_t = camp::decay<Data>;
188 data_t private_data = data;
190 Exec::exec(private_data,
true);
191 RAJA::expt::detail::combine_params<RAJA::hip_flatten_global_xyz_direct>(
192 private_data.param_tuple);
202 template<
int BlockSize,
typename Data,
typename Exec>
204 void HipKernelLauncherFixed(const Data data)
207 using data_t = camp::decay<Data>;
208 data_t private_data = data;
211 Exec::exec(private_data,
true);
213 RAJA::expt::detail::combine_params<RAJA::hip_flatten_global_xyz_direct>(
214 private_data.param_tuple);
225 template<
int BlockSize,
typename Data,
typename executor_t>
226 struct HipKernelLauncherGetter
228 using type = camp::decay<
229 decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
231 static constexpr type
get() noexcept
233 return &internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>;
241 template<
typename Data,
typename executor_t>
242 struct HipKernelLauncherGetter<0, Data, executor_t>
245 camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
247 static constexpr type
get() noexcept
249 return &internal::HipKernelLauncher<Data, executor_t>;
258 template<
typename LaunchPolicy,
262 struct HipLaunchHelper;
269 template<
bool async0,
275 struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,
280 using Self = HipLaunchHelper;
282 static constexpr
bool async = async0;
285 internal::hip_statement_list_executor_t<StmtList, Data, Types>;
287 using kernelGetter_t =
288 HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
292 inline static const void* get_func()
297 inline static void recommended_blocks_threads(
size_t shmem_size,
298 int& recommended_blocks,
299 int& recommended_threads)
301 auto func = Self::get_func();
306 if (num_threads <= 0)
313 auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads<Self>(
315 recommended_blocks = data.func_max_blocks_per_device;
316 recommended_threads = data.func_max_threads_per_block;
325 recommended_threads = num_threads;
327 auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
330 data.func_max_blocks_per_sm * data.device_sm_per_device;
336 if (num_threads <= 0)
343 recommended_threads = 1024;
351 recommended_threads = num_threads;
357 recommended_blocks = num_blocks;
364 if (num_threads <= 0)
379 max_threads = num_threads;
383 inline static void max_blocks(
size_t shmem_size,
387 auto func = Self::get_func();
395 if (num_threads <= 0 || num_threads != actual_threads)
401 auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self>(
402 func, shmem_size, actual_threads);
403 max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
411 auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
413 max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
422 max_blocks = num_blocks;
436 inline hip_dim_t fitHipDims(hip_dim_member_t limit,
438 hip_dim_t minimum = hip_dim_t())
443 result.x = result.x ? result.x : 1;
444 result.y = result.y ? result.y : 1;
445 result.z = result.z ? result.z : 1;
447 minimum.x = minimum.x ? minimum.x : 1;
448 minimum.y = minimum.y ? minimum.y : 1;
449 minimum.z = minimum.z ? minimum.z : 1;
452 if (result.x * result.y * result.z <= limit)
return result;
455 if (result.x * result.y * minimum.z < limit)
458 result.z = limit / (result.x * result.y);
462 result.z = minimum.z;
466 if (result.x * minimum.y * result.z < limit)
469 result.y = limit / (result.x * result.z);
473 result.y = minimum.y;
477 if (minimum.x * result.y * result.z < limit)
480 result.x = limit / (result.y * result.z);
484 result.x = minimum.x;
492 template<
typename LaunchConfig,
typename... EnclosedStmts,
typename Types>
493 struct StatementExecutor<
494 statement::HipKernelExt<LaunchConfig, EnclosedStmts...>,
499 using StatementType = statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
501 template<
typename Data>
502 static inline void exec(Data&& data)
505 using data_t = camp::decay<Data>;
507 hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
508 using launch_t = HipLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
511 RAJA::resources::Hip res = data.get_resource();
517 LaunchDims launch_dims = executor_t::calculateDimensions(data);
521 bool active_threads = launch_dims.threads_are_active();
522 bool active_blocks = launch_dims.blocks_are_active();
523 int num_blocks = launch_dims.num_blocks();
524 int num_threads = launch_dims.num_threads();
525 if ((active_threads || active_blocks) &&
526 (!active_blocks || num_blocks > 0) &&
527 (!active_threads || num_threads > 0))
539 int recommended_blocks;
540 int recommended_threads;
541 launch_t::recommended_blocks_threads(shmem, recommended_blocks,
542 recommended_threads);
549 launch_t::max_threads(shmem, max_threads);
555 hip_dim_t fit_threads {0, 0, 0};
557 if (recommended_threads >= get_size(launch_dims.min_dims.threads))
560 fit_threads = fitHipDims(recommended_threads, launch_dims.dims.threads,
561 launch_dims.min_dims.threads);
567 if (recommended_threads < max_threads &&
568 get_size(fit_threads) != recommended_threads)
571 fit_threads = fitHipDims(max_threads, launch_dims.dims.threads,
572 launch_dims.min_dims.threads);
575 launch_dims.dims.threads = fit_threads;
582 launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads());
586 if (launch_dims.num_threads() == recommended_threads)
592 use_blocks = recommended_blocks;
600 use_blocks = max_blocks;
603 launch_dims.dims.blocks = fitHipDims(use_blocks, launch_dims.dims.blocks,
604 launch_dims.min_dims.blocks);
613 if (launch_dims.num_threads() > max_threads)
619 auto func = launch_t::get_func();
624 ::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>;
626 RAJA::hip::detail::hipInfo launch_info;
627 launch_info.gridDim = launch_dims.dims.blocks;
628 launch_info.blockDim = launch_dims.dims.threads;
629 launch_info.dynamic_smem = &shmem;
630 launch_info.res = res;
632 RAJA::expt::detail::init_params<EXEC_POL>(data.param_tuple,
641 auto hip_data = RAJA::hip::make_launch_body(
642 func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
648 void*
args[] = {(
void*)&hip_data};
650 launch_dims.dims.threads,
args, shmem, res,
652 RAJA::expt::detail::resolve_params<EXEC_POL>(data.param_tuple,
Header file defining prototypes for routines used to manage memory for HIP reductions and other opera...
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Args args
Definition: WorkRunner.hpp:212
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file for statement wrappers and executors.
Header file for kernel lambda executor.
RAJA header file containing user interface for RAJA::kernel.
RAJA header file containing constructs used to run kernel traversals on GPU with HIP.
Header file for RAJA type definitions.