21 #ifndef RAJA_policy_cuda_kernel_CudaKernel_HPP
22 #define RAJA_policy_cuda_kernel_CudaKernel_HPP
24 #include "RAJA/config.hpp"
26 #if defined(RAJA_ENABLE_CUDA)
31 #include "camp/camp.hpp"
57 template<
bool async0,
int num_blocks,
int num_threads,
int blocks_per_sm>
58 struct cuda_explicit_launch
74 template<
bool async0,
int num_blocks,
int num_threads>
75 using cuda_launch = cuda_explicit_launch<async0,
78 policy::cuda::MIN_BLOCKS_PER_SM>;
85 template<
int num_threads0,
bool async0>
86 using cuda_occ_calc_launch =
87 cuda_explicit_launch<async0,
90 policy::cuda::MIN_BLOCKS_PER_SM>;
100 template<
typename LaunchConfig,
typename... EnclosedStmts>
102 :
public internal::Statement<
103 ::RAJA::policy::cuda::
104 cuda_exec_explicit<LaunchConfig, void, void, 0, true>,
114 template<
int num_blocks,
int num_threads,
typename... EnclosedStmts>
115 using CudaKernelExp = CudaKernelExt<cuda_launch<false, num_blocks, num_threads>,
124 template<
int num_blocks,
int num_threads,
typename... EnclosedStmts>
125 using CudaKernelExpAsync =
126 CudaKernelExt<cuda_launch<true, num_blocks, num_threads>, EnclosedStmts...>;
133 template<
typename... EnclosedStmts>
134 using CudaKernelOcc =
135 CudaKernelExt<cuda_occ_calc_launch<1024, false>, EnclosedStmts...>;
142 template<
typename... EnclosedStmts>
143 using CudaKernelOccAsync =
144 CudaKernelExt<cuda_occ_calc_launch<1024, true>, EnclosedStmts...>;
151 template<
int num_threads,
typename... EnclosedStmts>
152 using CudaKernelFixed = CudaKernelExt<
161 template<
int num_threads,
typename... EnclosedStmts>
162 using CudaKernelFixedAsync =
171 template<
int num_threads,
int blocks_per_sm,
typename... EnclosedStmts>
172 using CudaKernelFixedSM =
173 CudaKernelExt<cuda_explicit_launch<
false,
184 template<
int num_threads,
int blocks_per_sm,
typename... EnclosedStmts>
185 using CudaKernelFixedSMAsync =
186 CudaKernelExt<cuda_explicit_launch<
true,
196 template<
typename... EnclosedStmts>
197 using CudaKernel = CudaKernelFixed<1024, EnclosedStmts...>;
203 template<
typename... EnclosedStmts>
204 using CudaKernelAsync = CudaKernelFixedAsync<1024, EnclosedStmts...>;
215 template<
typename Data,
typename Exec>
216 __global__
void CudaKernelLauncher(
const RAJA_CUDA_GRID_CONSTANT Data data)
219 using data_t = camp::decay<Data>;
220 data_t private_data = data;
222 Exec::exec(private_data,
true);
224 RAJA::expt::detail::combine_params<RAJA::cuda_flatten_global_xyz_direct>(
225 private_data.param_tuple);
235 template<
int BlockSize,
int BlocksPerSM,
typename Data,
typename Exec>
237 void CudaKernelLauncherFixed(
const RAJA_CUDA_GRID_CONSTANT Data data)
240 using data_t = camp::decay<Data>;
241 data_t private_data = data;
244 Exec::exec(private_data,
true);
246 RAJA::expt::detail::combine_params<RAJA::cuda_flatten_global_xyz_direct>(
247 private_data.param_tuple);
258 template<
int BlockSize,
int BlocksPerSM,
typename Data,
typename executor_t>
259 struct CudaKernelLauncherGetter
262 camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize,
267 static constexpr type
get() noexcept
269 return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data,
278 template<
typename Data,
typename executor_t>
279 struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
282 camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
284 static constexpr type
get() noexcept
286 return &internal::CudaKernelLauncher<Data, executor_t>;
295 template<
typename LaunchPolicy,
299 struct CudaLaunchHelper;
306 template<
bool async0,
313 struct CudaLaunchHelper<
314 cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,
319 using Self = CudaLaunchHelper;
321 static constexpr
bool async = async0;
324 internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
326 using kernelGetter_t =
327 CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
328 (blocks_per_sm <= 0) ? 0 : blocks_per_sm,
332 inline static const void* get_func()
337 inline static void recommended_blocks_threads(
size_t shmem_size,
338 int& recommended_blocks,
339 int& recommended_threads)
341 auto func = Self::get_func();
346 if (num_threads <= 0)
353 auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads<Self>(
355 recommended_blocks = data.func_max_blocks_per_device;
356 recommended_threads = data.func_max_threads_per_block;
365 recommended_threads = num_threads;
367 auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
370 data.func_max_blocks_per_sm * data.device_sm_per_device;
376 if (num_threads <= 0)
383 recommended_threads = 1024;
391 recommended_threads = num_threads;
397 recommended_blocks = num_blocks;
404 if (num_threads <= 0)
419 max_threads = num_threads;
423 inline static void max_blocks(
size_t shmem_size,
427 auto func = Self::get_func();
435 if (num_threads <= 0 || num_threads != actual_threads)
441 auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self>(
442 func, shmem_size, actual_threads);
443 max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
451 auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
453 max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
462 max_blocks = num_blocks;
476 inline cuda_dim_t fitCudaDims(cuda_dim_member_t limit,
478 cuda_dim_t minimum = cuda_dim_t())
483 result.x = result.x ? result.x : 1;
484 result.y = result.y ? result.y : 1;
485 result.z = result.z ? result.z : 1;
487 minimum.x = minimum.x ? minimum.x : 1;
488 minimum.y = minimum.y ? minimum.y : 1;
489 minimum.z = minimum.z ? minimum.z : 1;
492 if (result.x * result.y * result.z <= limit)
return result;
495 if (result.x * result.y * minimum.z < limit)
498 result.z = limit / (result.x * result.y);
502 result.z = minimum.z;
506 if (result.x * minimum.y * result.z < limit)
509 result.y = limit / (result.x * result.z);
513 result.y = minimum.y;
517 if (minimum.x * result.y * result.z < limit)
520 result.x = limit / (result.y * result.z);
524 result.x = minimum.x;
532 template<
typename LaunchConfig,
typename... EnclosedStmts,
typename Types>
533 struct StatementExecutor<
534 statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>,
539 using StatementType =
540 statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>;
542 template<
typename Data>
543 static inline void exec(Data&& data)
546 using data_t = camp::decay<Data>;
548 cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
549 using launch_t = CudaLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
552 RAJA::resources::Cuda res = data.get_resource();
558 LaunchDims launch_dims = executor_t::calculateDimensions(data);
562 bool active_threads = launch_dims.threads_are_active();
563 bool active_blocks = launch_dims.blocks_are_active();
564 int num_blocks = launch_dims.num_blocks();
565 int num_threads = launch_dims.num_threads();
566 if ((active_threads || active_blocks) &&
567 (!active_blocks || num_blocks > 0) &&
568 (!active_threads || num_threads > 0))
580 int recommended_blocks;
581 int recommended_threads;
582 launch_t::recommended_blocks_threads(shmem, recommended_blocks,
583 recommended_threads);
590 launch_t::max_threads(shmem, max_threads);
596 cuda_dim_t fit_threads {0, 0, 0};
598 if (recommended_threads >= get_size(launch_dims.min_dims.threads))
601 fit_threads = fitCudaDims(recommended_threads, launch_dims.dims.threads,
602 launch_dims.min_dims.threads);
608 if (recommended_threads < max_threads &&
609 get_size(fit_threads) != recommended_threads)
612 fit_threads = fitCudaDims(max_threads, launch_dims.dims.threads,
613 launch_dims.min_dims.threads);
616 launch_dims.dims.threads = fit_threads;
623 launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads());
627 if (launch_dims.num_threads() == recommended_threads)
633 use_blocks = recommended_blocks;
641 use_blocks = max_blocks;
644 launch_dims.dims.blocks = fitCudaDims(use_blocks, launch_dims.dims.blocks,
645 launch_dims.min_dims.blocks);
654 if (launch_dims.num_threads() > max_threads)
660 auto func = launch_t::get_func();
665 ::RAJA::policy::cuda::cuda_exec_explicit<LaunchConfig, void, void,
668 RAJA::cuda::detail::cudaInfo launch_info;
669 launch_info.gridDim = launch_dims.dims.blocks;
670 launch_info.blockDim = launch_dims.dims.threads;
671 launch_info.dynamic_smem = &shmem;
672 launch_info.res = res;
674 RAJA::expt::detail::init_params<EXEC_POL>(data.param_tuple,
683 auto cuda_data = RAJA::cuda::make_launch_body(
684 func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
690 void*
args[] = {(
void*)&cuda_data};
692 launch_dims.dims.threads,
args, shmem, res,
694 RAJA::expt::detail::resolve_params<EXEC_POL>(data.param_tuple,
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Args args
Definition: WorkRunner.hpp:212
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file for statement wrappers and executors.
Header file for kernel lambda executor.
RAJA header file containing user interface for RAJA::kernel.
RAJA header file containing constructs used to run kernel traversals on GPU with CUDA.
Header file for RAJA type definitions.