24 #ifndef RAJA_forall_cuda_HPP
25 #define RAJA_forall_cuda_HPP
27 #include "RAJA/config.hpp"
29 #if defined(RAJA_ENABLE_CUDA)
73 template<
typename IterationMapping,
74 typename IterationGetter,
76 typename UniqueMarker>
77 struct ForallDimensionCalculator;
87 typename UniqueMarker>
88 struct ForallDimensionCalculator<
90 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
96 "block size must be > 0 or named_usage::unspecified with forall");
99 "grid size must be > 0 or named_usage::unspecified with forall");
101 using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
103 template<
typename IdxT>
104 static void set_dimensions(internal::CudaDims& dims,
109 const IdxT block_size =
static_cast<IdxT
>(IndexGetter::block_size);
110 const IdxT grid_size =
static_cast<IdxT
>(IndexGetter::grid_size);
112 if (len > (block_size * grid_size))
115 "len exceeds the size of the directly mapped index space");
118 internal::set_cuda_dim<dim>(dims.threads,
119 static_cast<IdxT
>(IndexGetter::block_size));
120 internal::set_cuda_dim<dim>(dims.blocks,
121 static_cast<IdxT
>(IndexGetter::grid_size));
127 typename Concretizer,
128 typename UniqueMarker>
129 struct ForallDimensionCalculator<
131 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
137 "grid size must be > 0 or named_usage::unspecified with forall");
140 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
142 template<
typename IdxT>
143 static void set_dimensions(internal::CudaDims& dims,
146 size_t dynamic_shmem_size)
148 ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
149 func, dynamic_shmem_size, len};
151 const IdxT grid_size =
static_cast<IdxT
>(IndexGetter::grid_size);
152 const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
154 if (block_size == IdxT(0))
157 "len exceeds the size of the directly mapped index space");
160 internal::set_cuda_dim<dim>(dims.threads, block_size);
161 internal::set_cuda_dim<dim>(dims.blocks, grid_size);
167 typename Concretizer,
168 typename UniqueMarker>
169 struct ForallDimensionCalculator<
171 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
177 "block size must be > 0 or named_usage::unspecified with forall");
180 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
182 template<
typename IdxT>
183 static void set_dimensions(internal::CudaDims& dims,
186 size_t dynamic_shmem_size)
188 ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
189 func, dynamic_shmem_size, len};
191 const IdxT block_size =
static_cast<IdxT
>(IndexGetter::block_size);
192 const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
194 internal::set_cuda_dim<dim>(dims.threads, block_size);
195 internal::set_cuda_dim<dim>(dims.blocks, grid_size);
199 template<named_dim dim,
typename Concretizer,
typename UniqueMarker>
200 struct ForallDimensionCalculator<
203 IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
207 using IndexGetter = ::RAJA::cuda::
208 IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
210 template<
typename IdxT>
211 static void set_dimensions(internal::CudaDims& dims,
214 size_t dynamic_shmem_size)
216 ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
217 func, dynamic_shmem_size, len};
219 const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
221 internal::set_cuda_dim<dim>(dims.threads, sizes.first);
222 internal::set_cuda_dim<dim>(dims.blocks, sizes.second);
229 typename Concretizer,
230 typename UniqueMarker>
231 struct ForallDimensionCalculator<
233 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
239 "block size must be > 0 or named_usage::unspecified with forall");
242 "grid size must be > 0 or named_usage::unspecified with forall");
244 using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
246 template<
typename IdxT>
247 static void set_dimensions(internal::CudaDims& dims,
252 const IdxT block_size =
static_cast<IdxT
>(IndexGetter::block_size);
253 const IdxT grid_size =
static_cast<IdxT
>(IndexGetter::grid_size);
255 internal::set_cuda_dim<dim>(dims.threads, block_size);
256 internal::set_cuda_dim<dim>(dims.blocks, grid_size);
262 typename Concretizer,
263 typename UniqueMarker>
264 struct ForallDimensionCalculator<
266 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
272 "grid size must be > 0 or named_usage::unspecified with forall");
275 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
277 template<
typename IdxT>
278 static void set_dimensions(internal::CudaDims& dims,
281 size_t dynamic_shmem_size)
283 ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
284 func, dynamic_shmem_size, len};
286 const IdxT grid_size =
static_cast<IdxT
>(IndexGetter::grid_size);
287 const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
289 internal::set_cuda_dim<dim>(dims.threads, block_size);
290 internal::set_cuda_dim<dim>(dims.blocks, grid_size);
296 typename Concretizer,
297 typename UniqueMarker>
298 struct ForallDimensionCalculator<
300 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
306 "block size must be > 0 or named_usage::unspecified with forall");
309 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
311 template<
typename IdxT>
312 static void set_dimensions(internal::CudaDims& dims,
315 size_t dynamic_shmem_size)
317 ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
318 func, dynamic_shmem_size, len};
320 const IdxT block_size =
static_cast<IdxT
>(IndexGetter::block_size);
321 const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
323 internal::set_cuda_dim<dim>(dims.threads, block_size);
324 internal::set_cuda_dim<dim>(dims.blocks, grid_size);
328 template<named_dim dim,
typename Concretizer,
typename UniqueMarker>
329 struct ForallDimensionCalculator<
332 IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
336 using IndexGetter = ::RAJA::cuda::
337 IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
339 template<
typename IdxT>
340 static void set_dimensions(internal::CudaDims& dims,
343 size_t dynamic_shmem_size)
345 ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
346 func, dynamic_shmem_size, len};
348 const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
350 internal::set_cuda_dim<dim>(dims.threads, sizes.first);
351 internal::set_cuda_dim<dim>(dims.blocks, sizes.second);
371 template<
typename EXEC_POL,
376 typename ForallParam,
377 typename IterationMapping =
typename EXEC_POL::IterationMapping,
378 typename IterationGetter =
typename EXEC_POL::IterationGetter,
379 std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
380 IterationMapping>::value &&
381 (IterationGetter::block_size > 0),
382 size_t> BlockSize = IterationGetter::block_size>
384 void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
385 const RAJA_CUDA_GRID_CONSTANT Iterator idx,
386 const RAJA_CUDA_GRID_CONSTANT IndexType length,
387 ForallParam f_params)
392 auto ii = IterationGetter::template index<IndexType>();
403 template<
typename EXEC_POL,
408 typename ForallParam,
409 typename IterationMapping =
typename EXEC_POL::IterationMapping,
410 typename IterationGetter =
typename EXEC_POL::IterationGetter,
411 std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
412 IterationMapping>::value &&
413 (IterationGetter::block_size <= 0),
415 __global__
void forallp_cuda_kernel(
416 const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
417 const RAJA_CUDA_GRID_CONSTANT Iterator idx,
418 const RAJA_CUDA_GRID_CONSTANT IndexType length,
419 ForallParam f_params)
424 auto ii = IterationGetter::template index<IndexType>();
441 typename ForallParam,
442 typename IterationMapping =
typename EXEC_POL::IterationMapping,
443 typename IterationGetter =
typename EXEC_POL::IterationGetter,
444 std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
445 IterationMapping>::value &&
446 std::is_base_of<iteration_mapping::UnsizedLoopBase,
447 IterationMapping>::value &&
448 (IterationGetter::block_size > 0),
449 size_t> BlockSize = IterationGetter::block_size>
451 void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
452 const RAJA_CUDA_GRID_CONSTANT Iterator idx,
453 const RAJA_CUDA_GRID_CONSTANT IndexType length,
454 ForallParam f_params)
460 for (
auto ii = IterationGetter::template index<IndexType>(); ii < length;
461 ii += IterationGetter::template size<IndexType>())
476 typename ForallParam,
477 typename IterationMapping =
typename EXEC_POL::IterationMapping,
478 typename IterationGetter =
typename EXEC_POL::IterationGetter,
479 std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
480 IterationMapping>::value &&
481 std::is_base_of<iteration_mapping::UnsizedLoopBase,
482 IterationMapping>::value &&
483 (IterationGetter::block_size <= 0),
485 __global__
void forallp_cuda_kernel(
486 const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
487 const RAJA_CUDA_GRID_CONSTANT Iterator idx,
488 const RAJA_CUDA_GRID_CONSTANT IndexType length,
489 ForallParam f_params)
495 for (
auto ii = IterationGetter::template index<IndexType>(); ii < length;
496 ii += IterationGetter::template size<IndexType>())
514 template<
typename Iterable,
516 typename IterationMapping,
517 typename IterationGetter,
518 typename Concretizer,
521 typename ForallParam>
522 RAJA_INLINE concepts::enable_if_t<
523 resources::EventProxy<resources::Cuda>,
526 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
532 LoopBody&& loop_body,
533 ForallParam f_params)
535 using Iterator = camp::decay<decltype(std::begin(
iter))>;
536 using LOOP_BODY = camp::decay<LoopBody>;
538 camp::decay<decltype(std::distance(std::begin(
iter), std::end(
iter)))>;
539 using EXEC_POL = camp::decay<decltype(pol)>;
541 ::camp::list<IterationMapping, IterationGetter, camp::num<BlocksPerSM>,
542 LOOP_BODY, Iterator, ForallParam>;
543 using DimensionCalculator =
544 impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
545 Concretizer, UniqueMarker>;
550 Iterator begin = std::begin(
iter);
551 Iterator end = std::end(
iter);
552 IndexType len = std::distance(begin, end);
558 auto func =
reinterpret_cast<const void*
>(
559 &impl::forallp_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
560 IndexType, camp::decay<ForallParam>>);
570 internal::CudaDims dims(1);
571 DimensionCalculator::set_dimensions(dims, len, func, shmem);
574 RAJA::cuda::detail::cudaInfo launch_info;
575 launch_info.gridDim = dims.blocks;
576 launch_info.blockDim = dims.threads;
577 launch_info.res = cuda_res;
585 LOOP_BODY
body = RAJA::cuda::make_launch_body(
586 func, dims.blocks, dims.threads, shmem, cuda_res,
587 std::forward<LoopBody>(loop_body));
592 void*
args[] = {(
void*)&
body, (
void*)&begin, (
void*)&len,
602 return resources::EventProxy<resources::Cuda>(cuda_res);
623 template<
typename LoopBody,
624 typename IterationMapping,
625 typename IterationGetter,
626 typename Concretizer,
629 typename... SegmentTypes>
630 RAJA_INLINE resources::EventProxy<resources::Cuda>
forall_impl(
633 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
638 const TypedIndexSet<SegmentTypes...>& iset,
639 LoopBody&& loop_body)
641 int num_seg = iset.getNumSegments();
642 for (
int isi = 0; isi < num_seg; ++isi)
645 r, isi, detail::CallForall(),
646 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
647 IterationGetter, Concretizer,
648 BlocksPerSM,
true>(),
653 return resources::EventProxy<resources::Cuda>(r);
RAJA header file defining index set classes.
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
constexpr RAJA_HOST_DEVICE auto invoke_body(Params &¶ms, Fn &&f, Ts &&... extra)
Definition: forall.hpp:598
RAJA_HOST_DEVICE auto thread_privatize(const T &item) -> Privatizer< T >
Create a private copy of the argument to be stored on the current thread's stack in a class of the Pr...
Definition: privatizer.hpp:88
seq_exec seq_segit
Definition: policy.hpp:83
RAJA_INLINE concepts::enable_if_t< resources::EventProxy< resources::Host >, expt::type_traits::is_ForallParamPack< ForallParam >, expt::type_traits::is_ForallParamPack_empty< ForallParam > > forall_impl(resources::Host host_res, const simd_exec &, Iterable &&iter, Func &&body, ForallParam)
Definition: forall.hpp:81
Definition: AlignedRangeIndexSetBuilders.cpp:35
named_dim
Definition: types.hpp:53
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
auto & body
Definition: launch.hpp:177
auto privatizer
Definition: launch.hpp:176
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
Header file containing RAJA index set and segment iteration template methods that take an execution p...
Header file containing utility methods used in CUDA operations.
Header file for RAJA resource definitions.
static constexpr void parampack_resolve(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:304
static constexpr void parampack_init(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:269
static RAJA_HOST_DEVICE constexpr void parampack_combine(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:286
Definition: TypeTraits.hpp:59
Definition: types.hpp:143
Definition: types.hpp:209
Header file for RAJA type definitions.