22 #ifndef RAJA_policy_cuda_kernel_internal_HPP
23 #define RAJA_policy_cuda_kernel_internal_HPP
25 #include "RAJA/config.hpp"
27 #if defined(RAJA_ENABLE_CUDA)
32 #include "camp/camp.hpp"
// Lower bounds for `dims`; clamp_to_min_blocks() / clamp_to_min_threads()
// raise `dims` up to these values. Brace-initialized with 0.
53 CudaDims min_dims {0};
// Default construction, copy and move (construction and assignment) are all
// compiler-generated.
55 LaunchDims() =
default;
56 LaunchDims(LaunchDims
const&) =
default;
57 LaunchDims(LaunchDims&&) =
default;
58 LaunchDims& operator=(LaunchDims
const&) =
default;
59 LaunchDims& operator=(LaunchDims&&) =
default;
// Constructs a LaunchDims from explicit values for the three members:
// active flags, dims and min_dims.
62 LaunchDims(CudaDims _active, CudaDims _dims, CudaDims _min_dims)
// Component-wise maximum of this object and `c`.
// For each of `active`, `dims` and `min_dims`, the x/y/z components of both
// `blocks` and `threads` are maximised independently and stored in `result`.
69 LaunchDims
max(LaunchDims
const& c)
const
// blocks: active flags, dims, then min_dims
73 result.active.blocks.x =
std::max(c.active.blocks.x, active.blocks.x);
74 result.active.blocks.y =
std::max(c.active.blocks.y, active.blocks.y);
75 result.active.blocks.z =
std::max(c.active.blocks.z, active.blocks.z);
77 result.dims.blocks.x =
std::max(c.dims.blocks.x, dims.blocks.x);
78 result.dims.blocks.y =
std::max(c.dims.blocks.y, dims.blocks.y);
79 result.dims.blocks.z =
std::max(c.dims.blocks.z, dims.blocks.z);
81 result.min_dims.blocks.x =
std::max(c.min_dims.blocks.x, min_dims.blocks.x);
82 result.min_dims.blocks.y =
std::max(c.min_dims.blocks.y, min_dims.blocks.y);
83 result.min_dims.blocks.z =
std::max(c.min_dims.blocks.z, min_dims.blocks.z);
// threads: active flags, dims, then min_dims
85 result.active.threads.x =
std::max(c.active.threads.x, active.threads.x);
86 result.active.threads.y =
std::max(c.active.threads.y, active.threads.y);
87 result.active.threads.z =
std::max(c.active.threads.z, active.threads.z);
89 result.dims.threads.x =
std::max(c.dims.threads.x, dims.threads.x);
90 result.dims.threads.y =
std::max(c.dims.threads.y, dims.threads.y);
91 result.dims.threads.z =
std::max(c.dims.threads.z, dims.threads.z);
93 result.min_dims.threads.x =
94 std::max(c.min_dims.threads.x, min_dims.threads.x);
95 result.min_dims.threads.y =
96 std::max(c.min_dims.threads.y, min_dims.threads.y);
97 result.min_dims.threads.z =
98 std::max(c.min_dims.threads.z, min_dims.threads.z);
// Returns nonzero when at least one of the x, y or z block dimensions is
// flagged active, and 0 otherwise.
104 int blocks_are_active()
const
106 return active.blocks.x || active.blocks.y || active.blocks.z;
// Returns nonzero when at least one of the x, y or z thread dimensions is
// flagged active, and 0 otherwise.
110 int threads_are_active()
const
112 return active.threads.x || active.threads.y || active.threads.z;
// Number of blocks described by this object.
// When any block dimension is active, this is the product of dims.blocks over
// the active dimensions only; an inactive dimension contributes a factor of 1.
// NOTE(review): the value returned when no block dimension is active is not
// shown alongside this code -- confirm.
116 int num_blocks()
const
118 if (blocks_are_active())
120 return (active.blocks.x ? dims.blocks.x : 1) *
121 (active.blocks.y ? dims.blocks.y : 1) *
122 (active.blocks.z ? dims.blocks.z : 1);
// Number of threads described by this object.
// When any thread dimension is active, this is the product of dims.threads
// over the active dimensions only; an inactive dimension contributes a factor
// of 1.
// NOTE(review): the value returned when no thread dimension is active is not
// shown alongside this code -- confirm.
131 int num_threads()
const
133 if (threads_are_active())
135 return (active.threads.x ? dims.threads.x : 1) *
136 (active.threads.y ? dims.threads.y : 1) *
137 (active.threads.z ? dims.threads.z : 1);
// Raises each component of dims.blocks to at least the matching component of
// min_dims.blocks. Components already at or above the minimum are unchanged.
146 void clamp_to_min_blocks()
148 dims.blocks.x =
std::max(min_dims.blocks.x, dims.blocks.x);
149 dims.blocks.y =
std::max(min_dims.blocks.y, dims.blocks.y);
150 dims.blocks.z =
std::max(min_dims.blocks.z, dims.blocks.z);
// Raises each component of dims.threads to at least the matching component of
// min_dims.threads. Components already at or above the minimum are unchanged.
154 void clamp_to_min_threads()
156 dims.threads.x =
std::max(min_dims.threads.x, dims.threads.x);
157 dims.threads.y =
std::max(min_dims.threads.y, dims.threads.y);
158 dims.threads.z =
std::max(min_dims.threads.z, dims.threads.z);
// Produces a single LaunchDims from two operands.
// CudaStatementListExecutorHelper::calculateDimensions uses it to fold the
// dimensions of one statement with those of the statements that follow it.
// NOTE(review): presumably implemented via LaunchDims::max -- confirm.
163 LaunchDims combine(LaunchDims
const& lhs, LaunchDims
const& rhs)
// Recursive helper that walks the executors in StmtList by index.
//   cur_stmt  - index of the statement handled by this instantiation
//   num_stmts - total number of statements in the list
//   StmtList  - type list of statement executors (indexed with camp::at_v)
// Each instantiation handles statement cur_stmt and then delegates to the
// instantiation for cur_stmt + 1.
168 template<camp::
idx_t cur_stmt, camp::
idx_t num_stmts,
typename StmtList>
169 struct CudaStatementListExecutorHelper
// Helper for the remainder of the list.
172 using next_helper_t =
173 CudaStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
// Executor type of the statement at the current index.
175 using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
// Device-side execution: runs the current statement, then the following ones,
// forwarding the same `data` and `thread_active` to each.
177 template<
typename Data>
178 inline static RAJA_DEVICE void exec(Data& data,
bool thread_active)
181 cur_stmt_t::exec(data, thread_active);
184 next_helper_t::exec(data, thread_active);
// Computes the launch dimensions of the current statement and of the remaining
// statements, and returns the two folded together with combine().
187 template<
typename Data>
188 inline static LaunchDims calculateDimensions(Data& data)
190 LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
192 LaunchDims next_dims = next_helper_t::calculateDimensions(data);
194 return combine(statement_dims, next_dims);
// Partial specialization for cur_stmt == num_stmts, i.e. one past the last
// statement. It ends the recursion started by the primary template.
// NOTE(review): the member bodies are not shown alongside these declarations;
// presumably exec does nothing and calculateDimensions returns an empty
// LaunchDims -- confirm.
198 template<camp::
idx_t num_stmts,
typename StmtList>
199 struct CudaStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
202 template<
typename Data>
// The Data argument is unnamed, so it is not used by this overload.
208 template<
typename Data>
209 inline static LaunchDims calculateDimensions(Data&)
// Forward declaration of the per-statement executor, parameterized on the
// kernel data type, the statement policy and the Types parameter.
// CudaStatementListExecutor instantiates it once per statement.
216 template<
typename Data,
typename Policy,
typename Types>
217 struct CudaStatementExecutor;
// Forward declaration of the executor for a whole statement list.
// It is specialized for StatementList<Stmts...>.
219 template<
typename Data,
typename StmtList,
typename Types>
220 struct CudaStatementListExecutor;
// Executor for StatementList<Stmts...>.
// Wraps every statement in its CudaStatementExecutor and drives the resulting
// list through CudaStatementListExecutorHelper, starting at index 0.
222 template<
typename Data,
typename... Stmts,
typename Types>
223 struct CudaStatementListExecutor<Data,
StatementList<Stmts...>, Types>
// One CudaStatementExecutor per statement, kept in declaration order.
226 using enclosed_stmts_t =
227 camp::list<CudaStatementExecutor<Data, Stmts, Types>...>;
// Number of statements in the list.
229 static constexpr
size_t num_stmts =
sizeof...(Stmts);
// Device-side entry point: executes all enclosed statements in order.
231 static inline RAJA_DEVICE void exec(Data& data,
bool thread_active)
234 CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
235 data, thread_active);
// Returns the launch dimensions for the whole list, as computed by the
// helper's calculateDimensions.
238 static inline LaunchDims calculateDimensions(Data
const& data)
241 return CudaStatementListExecutorHelper<
242 0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
// Convenience alias for CudaStatementListExecutor.
// Note the parameter order: the alias takes (StmtList, Data, Types) while the
// struct takes (Data, StmtList, Types).
246 template<
typename StmtList,
typename Data,
typename Types>
247 using cuda_statement_list_executor_t =
248 CudaStatementListExecutor<Data, StmtList, Types>;
// Forward declaration. Each specialization provides a static
// get_dimensions(len) that returns the LaunchDims for an index space of
// length `len` under one cuda_indexer configuration.
251 template<
typename kernel_indexer>
252 struct KernelDimensionCalculator;
// DirectUnchecked mapping; block size ignored, grid size ignored.
// Neither threads nor blocks are used along `dim`.
255 template<named_dim dim, kernel_sync_requirement sync>
256 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
257 iteration_mapping::DirectUnchecked,
259 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
262 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
// Returns a default-constructed LaunchDims (nothing requested along `dim`).
264 template<
typename IdxT>
265 static LaunchDims get_dimensions(IdxT len)
// A length other than 1 is diagnosed as a mismatch with the mapped index space.
267 if (len !=
static_cast<IdxT
>(1))
270 "mapped index space");
273 return LaunchDims {};
// DirectUnchecked mapping; block size unspecified, grid size ignored.
// Only threads are used along `dim`, and their count comes from `len`.
278 template<named_dim dim, kernel_sync_requirement sync>
279 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
280 iteration_mapping::DirectUnchecked,
282 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
285 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
// Marks threads active along `dim` and sets both dims.threads and
// min_dims.threads to `len`.
287 template<
typename IdxT>
288 static LaunchDims get_dimensions(IdxT len)
294 set_cuda_dim<dim>(dims.active.threads,
295 static_cast<cuda_dim_member_t
>(
true));
296 set_cuda_dim<dim>(dims.dims.threads,
static_cast<cuda_dim_member_t
>(len));
297 set_cuda_dim<dim>(dims.min_dims.threads,
298 static_cast<cuda_dim_member_t
>(len));
// DirectUnchecked mapping; compile-time block size, grid size ignored.
// Only threads are used along `dim`, with exactly BLOCK_SIZE of them.
305 template<named_dim dim,
int BLOCK_SIZE, kernel_sync_requirement sync>
306 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
307 iteration_mapping::DirectUnchecked,
309 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
311 static_assert(BLOCK_SIZE > 0,
312 "block size must be > 0, named_usage::unspecified, or "
313 "named_usage::ignored with kernel");
315 using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
// Marks threads active along `dim` and sets both dims.threads and
// min_dims.threads to the compile-time block size.
317 template<
typename IdxT>
318 static LaunchDims get_dimensions(IdxT len)
// len must equal the block size exactly; otherwise a mismatch with the mapped
// index space is diagnosed.
320 if (len !=
static_cast<IdxT
>(IndexMapper::block_size))
323 "mapped index space");
328 set_cuda_dim<dim>(dims.active.threads,
329 static_cast<cuda_dim_member_t
>(
true));
330 set_cuda_dim<dim>(dims.dims.threads,
331 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
332 set_cuda_dim<dim>(dims.min_dims.threads,
333 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
// DirectUnchecked mapping; block size ignored, grid size unspecified.
// Only blocks are used along `dim`, and their count comes from `len`.
340 template<named_dim dim, kernel_sync_requirement sync>
341 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
342 iteration_mapping::DirectUnchecked,
344 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
347 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
// Marks blocks active along `dim` and sets both dims.blocks and
// min_dims.blocks to `len`.
349 template<
typename IdxT>
350 static LaunchDims get_dimensions(IdxT len)
354 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
355 set_cuda_dim<dim>(dims.dims.blocks,
static_cast<cuda_dim_member_t
>(len));
356 set_cuda_dim<dim>(dims.min_dims.blocks,
357 static_cast<cuda_dim_member_t
>(len));
// DirectUnchecked mapping; block size ignored, compile-time grid size.
// Only blocks are used along `dim`, with exactly GRID_SIZE of them.
364 template<named_dim dim,
int GRID_SIZE, kernel_sync_requirement sync>
365 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
366 iteration_mapping::DirectUnchecked,
368 cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
370 static_assert(GRID_SIZE > 0,
371 "grid size must be > 0, named_usage::unspecified, or "
372 "named_usage::ignored with kernel");
374 using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
// Marks blocks active along `dim` and sets both dims.blocks and
// min_dims.blocks to the compile-time grid size.
376 template<
typename IdxT>
377 static LaunchDims get_dimensions(IdxT len)
// len must equal the grid size exactly; otherwise a mismatch with the mapped
// index space is diagnosed.
379 if (len !=
static_cast<IdxT
>(IndexMapper::grid_size))
382 "mapped index space");
387 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
388 set_cuda_dim<dim>(dims.dims.blocks,
389 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
390 set_cuda_dim<dim>(dims.min_dims.blocks,
391 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// DirectUnchecked mapping; block size and grid size both unspecified.
398 template<named_dim dim, kernel_sync_requirement sync>
399 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
400 iteration_mapping::DirectUnchecked,
402 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
404 using IndexMapper = cuda::
405 IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
// The only visible logic is a test of `len` against zero.
// NOTE(review): the handling of a nonzero len and the returned value are not
// shown alongside this code; presumably a nonzero len is treated as
// unsupported for this configuration -- confirm.
407 template<
typename IdxT>
408 static LaunchDims get_dimensions(IdxT len)
410 if (len !=
static_cast<IdxT
>(0))
// DirectUnchecked mapping; block size unspecified, compile-time grid size.
// Both threads and blocks are used along `dim`. The grid size is fixed and the
// block size is computed at run time.
420 template<named_dim dim,
int GRID_SIZE, kernel_sync_requirement sync>
421 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
422 iteration_mapping::DirectUnchecked,
424 cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
426 static_assert(GRID_SIZE > 0,
427 "grid size must be > 0, named_usage::unspecified, or "
428 "named_usage::ignored with kernel");
431 cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
// Sets dims and min_dims to block_size for threads and to the grid size for
// blocks, and marks both active.
433 template<
typename IdxT>
434 static LaunchDims get_dimensions(IdxT len)
// NOTE(review): the initializer of block_size is not shown alongside this
// code; presumably len divided by the grid size, rounded up -- confirm.
438 const IdxT block_size =
// len must equal block_size * grid_size exactly; otherwise a mismatch with the
// mapped index space is diagnosed.
440 if (len != (block_size *
static_cast<IdxT
>(IndexMapper::grid_size)))
443 "mapped index space");
448 set_cuda_dim<dim>(dims.active.threads,
449 static_cast<cuda_dim_member_t
>(
true));
450 set_cuda_dim<dim>(dims.dims.threads,
451 static_cast<cuda_dim_member_t
>(block_size));
452 set_cuda_dim<dim>(dims.min_dims.threads,
453 static_cast<cuda_dim_member_t
>(block_size));
455 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
456 set_cuda_dim<dim>(dims.dims.blocks,
457 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
458 set_cuda_dim<dim>(dims.min_dims.blocks,
459 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// DirectUnchecked mapping; compile-time block size, grid size unspecified.
// Both threads and blocks are used along `dim`. The block size is fixed and
// the grid size is computed at run time.
466 template<named_dim dim,
int BLOCK_SIZE, kernel_sync_requirement sync>
467 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
468 iteration_mapping::DirectUnchecked,
470 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
472 static_assert(BLOCK_SIZE > 0,
473 "block size must be > 0, named_usage::unspecified, or "
474 "named_usage::ignored with kernel");
477 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
// Sets dims and min_dims to the block size for threads and to grid_size for
// blocks, and marks both active.
// NOTE(review): grid_size is computed from (len, block size); the start of
// that statement is not shown alongside this code. Presumably len divided by
// the block size, rounded up -- confirm.
479 template<
typename IdxT>
480 static LaunchDims get_dimensions(IdxT len)
483 len,
static_cast<IdxT
>(IndexMapper::block_size));
// len must equal block_size * grid_size exactly; otherwise a mismatch with the
// mapped index space is diagnosed.
484 if (len != (
static_cast<IdxT
>(IndexMapper::block_size) * grid_size))
487 "mapped index space");
492 set_cuda_dim<dim>(dims.active.threads,
493 static_cast<cuda_dim_member_t
>(
true));
494 set_cuda_dim<dim>(dims.dims.threads,
495 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
496 set_cuda_dim<dim>(dims.min_dims.threads,
497 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
499 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
500 set_cuda_dim<dim>(dims.dims.blocks,
501 static_cast<cuda_dim_member_t
>(grid_size));
502 set_cuda_dim<dim>(dims.min_dims.blocks,
503 static_cast<cuda_dim_member_t
>(grid_size));
// DirectUnchecked mapping; compile-time block size and compile-time grid size.
// Both threads and blocks are used along `dim`, with fixed sizes.
514 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
515 iteration_mapping::DirectUnchecked,
517 cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
519 static_assert(BLOCK_SIZE > 0,
520 "block size must be > 0, named_usage::unspecified, or "
521 "named_usage::ignored with kernel");
522 static_assert(GRID_SIZE > 0,
523 "grid size must be > 0, named_usage::unspecified, or "
524 "named_usage::ignored with kernel");
526 using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
// Sets dims and min_dims to the block size for threads and to the grid size
// for blocks, and marks both active.
528 template<
typename IdxT>
529 static LaunchDims get_dimensions(IdxT len)
// len must equal block_size * grid_size exactly; otherwise a mismatch with the
// mapped index space is diagnosed.
531 if (len != (
static_cast<IdxT
>(IndexMapper::block_size) *
532 static_cast<IdxT
>(IndexMapper::grid_size)))
535 "mapped index space");
540 set_cuda_dim<dim>(dims.active.threads,
541 static_cast<cuda_dim_member_t
>(
true));
542 set_cuda_dim<dim>(dims.dims.threads,
543 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
544 set_cuda_dim<dim>(dims.min_dims.threads,
545 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
547 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
548 set_cuda_dim<dim>(dims.dims.blocks,
549 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
550 set_cuda_dim<dim>(dims.min_dims.blocks,
551 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// Direct mapping; block size ignored, grid size ignored.
// Neither threads nor blocks are used along `dim`.
558 template<named_dim dim, kernel_sync_requirement sync>
559 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
560 iteration_mapping::Direct,
562 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
565 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
// Returns a default-constructed LaunchDims (nothing requested along `dim`).
567 template<
typename IdxT>
568 static LaunchDims get_dimensions(IdxT len)
// Unlike the DirectUnchecked variant, only a len greater than 1 is diagnosed.
570 if (len >
static_cast<IdxT
>(1))
573 "len exceeds the size of the directly mapped index space");
576 return LaunchDims {};
// Direct mapping; block size unspecified, grid size ignored.
// Only threads are used along `dim`, and their count comes from `len`.
581 template<named_dim dim, kernel_sync_requirement sync>
582 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
583 iteration_mapping::Direct,
585 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
588 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
// Marks threads active along `dim` and sets both dims.threads and
// min_dims.threads to `len`.
590 template<
typename IdxT>
591 static LaunchDims get_dimensions(IdxT len)
597 set_cuda_dim<dim>(dims.active.threads,
598 static_cast<cuda_dim_member_t
>(
true));
599 set_cuda_dim<dim>(dims.dims.threads,
static_cast<cuda_dim_member_t
>(len));
600 set_cuda_dim<dim>(dims.min_dims.threads,
601 static_cast<cuda_dim_member_t
>(len));
// Direct mapping; compile-time block size, grid size ignored.
// Only threads are used along `dim`, with at most BLOCK_SIZE iterations.
608 template<named_dim dim,
int BLOCK_SIZE, kernel_sync_requirement sync>
609 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
610 iteration_mapping::Direct,
612 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
614 static_assert(BLOCK_SIZE > 0,
615 "block size must be > 0, named_usage::unspecified, or "
616 "named_usage::ignored with kernel");
618 using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
// Marks threads active along `dim`. dims.threads is the block size when
// len > 0 and 0 for an empty range; min_dims.threads is always the block size.
620 template<
typename IdxT>
621 static LaunchDims get_dimensions(IdxT len)
623 constexpr
auto zero =
static_cast<IdxT
>(0);
// len may be smaller than the block size but must not exceed it.
625 if (len >
static_cast<IdxT
>(IndexMapper::block_size))
628 "len exceeds the size of the directly mapped index space");
633 set_cuda_dim<dim>(dims.active.threads,
634 static_cast<cuda_dim_member_t
>(
true));
635 set_cuda_dim<dim>(dims.dims.threads,
636 static_cast<cuda_dim_member_t
>(
637 (len > zero) ? IndexMapper::block_size : 0));
638 set_cuda_dim<dim>(dims.min_dims.threads,
639 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
// Direct mapping; block size ignored, grid size unspecified.
// Only blocks are used along `dim`, and their count comes from `len`.
646 template<named_dim dim, kernel_sync_requirement sync>
647 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
648 iteration_mapping::Direct,
650 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
653 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
// Marks blocks active along `dim` and sets both dims.blocks and
// min_dims.blocks to `len`.
655 template<
typename IdxT>
656 static LaunchDims get_dimensions(IdxT len)
660 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
661 set_cuda_dim<dim>(dims.dims.blocks,
static_cast<cuda_dim_member_t
>(len));
662 set_cuda_dim<dim>(dims.min_dims.blocks,
663 static_cast<cuda_dim_member_t
>(len));
// Direct mapping; block size ignored, compile-time grid size.
// Only blocks are used along `dim`, with at most GRID_SIZE iterations.
670 template<named_dim dim,
int GRID_SIZE, kernel_sync_requirement sync>
671 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
672 iteration_mapping::Direct,
674 cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
676 static_assert(GRID_SIZE > 0,
677 "grid size must be > 0, named_usage::unspecified, or "
678 "named_usage::ignored with kernel");
680 using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
// Marks blocks active along `dim`. dims.blocks is the grid size when len > 0
// and 0 for an empty range; min_dims.blocks is always the grid size.
682 template<
typename IdxT>
683 static LaunchDims get_dimensions(IdxT len)
685 constexpr
auto zero =
static_cast<IdxT
>(0);
// len may be smaller than the grid size but must not exceed it.
687 if (len >
static_cast<IdxT
>(IndexMapper::grid_size))
690 "len exceeds the size of the directly mapped index space");
695 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
696 set_cuda_dim<dim>(dims.dims.blocks,
697 static_cast<cuda_dim_member_t
>(
698 (len > zero) ? IndexMapper::grid_size : 0));
699 set_cuda_dim<dim>(dims.min_dims.blocks,
700 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// Direct mapping; block size and grid size both unspecified.
707 template<named_dim dim, kernel_sync_requirement sync>
708 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
709 iteration_mapping::Direct,
711 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
713 using IndexMapper = cuda::
714 IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
// Returns a default-constructed LaunchDims.
// NOTE(review): the action taken when len > 0 is not shown alongside this
// code; presumably a non-empty range is treated as unsupported for this
// configuration -- confirm.
716 template<
typename IdxT>
717 static LaunchDims get_dimensions(IdxT len)
719 if (len >
static_cast<IdxT
>(0))
724 return LaunchDims {};
// Direct mapping; block size unspecified, compile-time grid size.
// Both threads and blocks are used along `dim`. The grid size is fixed and the
// block size is computed at run time.
729 template<named_dim dim,
int GRID_SIZE, kernel_sync_requirement sync>
730 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
731 iteration_mapping::Direct,
733 cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
735 static_assert(GRID_SIZE > 0,
736 "grid size must be > 0, named_usage::unspecified, or "
737 "named_usage::ignored with kernel");
740 cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
// Threads: dims and min_dims are both block_size.
// Blocks: dims is the grid size when len > 0 and 0 for an empty range;
// min_dims is always the grid size.
742 template<
typename IdxT>
743 static LaunchDims get_dimensions(IdxT len)
745 constexpr
auto zero =
static_cast<IdxT
>(0);
// NOTE(review): the initializer of block_size is not shown alongside this
// code; presumably len divided by the grid size, rounded up -- confirm.
749 const IdxT block_size =
754 set_cuda_dim<dim>(dims.active.threads,
755 static_cast<cuda_dim_member_t
>(
true));
756 set_cuda_dim<dim>(dims.dims.threads,
757 static_cast<cuda_dim_member_t
>(block_size));
758 set_cuda_dim<dim>(dims.min_dims.threads,
759 static_cast<cuda_dim_member_t
>(block_size));
761 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
762 set_cuda_dim<dim>(dims.dims.blocks,
763 static_cast<cuda_dim_member_t
>(
764 (len > zero) ? IndexMapper::grid_size : 0));
765 set_cuda_dim<dim>(dims.min_dims.blocks,
766 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// Direct mapping; compile-time block size, grid size unspecified.
// Both threads and blocks are used along `dim`. The block size is fixed and
// the grid size is computed at run time.
773 template<named_dim dim,
int BLOCK_SIZE, kernel_sync_requirement sync>
774 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
775 iteration_mapping::Direct,
777 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
779 static_assert(BLOCK_SIZE > 0,
780 "block size must be > 0, named_usage::unspecified, or "
781 "named_usage::ignored with kernel");
784 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
// Threads: dims is the block size when len > 0 and 0 for an empty range;
// min_dims is always the block size.
// Blocks: dims and min_dims are both grid_size.
// NOTE(review): grid_size is computed from (len, block size); the start of
// that statement is not shown alongside this code. Presumably len divided by
// the block size, rounded up -- confirm.
786 template<
typename IdxT>
787 static LaunchDims get_dimensions(IdxT len)
789 constexpr
auto zero =
static_cast<IdxT
>(0);
792 len,
static_cast<IdxT
>(IndexMapper::block_size));
796 set_cuda_dim<dim>(dims.active.threads,
797 static_cast<cuda_dim_member_t
>(
true));
798 set_cuda_dim<dim>(dims.dims.threads,
799 static_cast<cuda_dim_member_t
>(
800 (len > zero) ? IndexMapper::block_size : 0));
801 set_cuda_dim<dim>(dims.min_dims.threads,
802 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
804 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
805 set_cuda_dim<dim>(dims.dims.blocks,
806 static_cast<cuda_dim_member_t
>(grid_size));
807 set_cuda_dim<dim>(dims.min_dims.blocks,
808 static_cast<cuda_dim_member_t
>(grid_size));
// Direct mapping; compile-time block size and compile-time grid size.
// Both threads and blocks are used along `dim`. The range may hold at most
// BLOCK_SIZE * GRID_SIZE iterations.
819 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
820 iteration_mapping::Direct,
822 cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
824 static_assert(BLOCK_SIZE > 0,
825 "block size must be > 0, named_usage::unspecified, or "
826 "named_usage::ignored with kernel");
827 static_assert(GRID_SIZE > 0,
828 "grid size must be > 0, named_usage::unspecified, or "
829 "named_usage::ignored with kernel");
831 using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
// For both threads and blocks, dims is the compile-time size when len > 0 and
// 0 for an empty range; min_dims is always the compile-time size.
833 template<
typename IdxT>
834 static LaunchDims get_dimensions(IdxT len)
836 constexpr
auto zero =
static_cast<IdxT
>(0);
// len may be smaller than block_size * grid_size but must not exceed it.
838 if (len > (
static_cast<IdxT
>(IndexMapper::block_size) *
839 static_cast<IdxT
>(IndexMapper::grid_size)))
842 "len exceeds the size of the directly mapped index space");
847 set_cuda_dim<dim>(dims.active.threads,
848 static_cast<cuda_dim_member_t
>(
true));
849 set_cuda_dim<dim>(dims.dims.threads,
850 static_cast<cuda_dim_member_t
>(
851 (len > zero) ? IndexMapper::block_size : 0));
852 set_cuda_dim<dim>(dims.min_dims.threads,
853 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
855 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
856 set_cuda_dim<dim>(dims.dims.blocks,
857 static_cast<cuda_dim_member_t
>(
858 (len > zero) ? IndexMapper::grid_size : 0));
859 set_cuda_dim<dim>(dims.min_dims.blocks,
860 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// StridedLoop mapping; block size ignored, grid size ignored.
// Neither threads nor blocks are used along `dim`; the visible code returns a
// default-constructed LaunchDims.
867 template<named_dim dim, kernel_sync_requirement sync>
868 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
869 iteration_mapping::StridedLoop<named_usage::unspecified>,
871 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
874 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
876 template<
typename IdxT>
879 return LaunchDims {};
// StridedLoop mapping; block size unspecified, grid size ignored.
// Only threads are used along `dim`.
884 template<named_dim dim, kernel_sync_requirement sync>
885 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
886 iteration_mapping::StridedLoop<named_usage::unspecified>,
888 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
891 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
// Marks threads active along `dim`. dims.threads is set to `len`, while
// min_dims.threads is only 1, unlike the Direct mappings where it is `len`.
893 template<
typename IdxT>
894 static LaunchDims get_dimensions(IdxT len)
900 set_cuda_dim<dim>(dims.active.threads,
901 static_cast<cuda_dim_member_t
>(
true));
902 set_cuda_dim<dim>(dims.dims.threads,
static_cast<cuda_dim_member_t
>(len));
903 set_cuda_dim<dim>(dims.min_dims.threads,
static_cast<cuda_dim_member_t
>(1));
// StridedLoop mapping; compile-time block size, grid size ignored.
// Only threads are used along `dim`.
910 template<named_dim dim,
int BLOCK_SIZE, kernel_sync_requirement sync>
911 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
912 iteration_mapping::StridedLoop<named_usage::unspecified>,
914 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
916 static_assert(BLOCK_SIZE > 0,
917 "block size must be > 0, named_usage::unspecified, or "
918 "named_usage::ignored with kernel");
920 using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
// Marks threads active along `dim`. dims.threads is the block size when
// len > 0 and 0 for an empty range; min_dims.threads is always the block size.
922 template<
typename IdxT>
923 static LaunchDims get_dimensions(IdxT len)
925 constexpr
auto zero =
static_cast<IdxT
>(0);
929 set_cuda_dim<dim>(dims.active.threads,
930 static_cast<cuda_dim_member_t
>(
true));
931 set_cuda_dim<dim>(dims.dims.threads,
932 static_cast<cuda_dim_member_t
>(
933 (len > zero) ? IndexMapper::block_size : 0));
934 set_cuda_dim<dim>(dims.min_dims.threads,
935 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
// StridedLoop mapping; block size ignored, grid size unspecified.
// Only blocks are used along `dim`.
942 template<named_dim dim, kernel_sync_requirement sync>
943 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
944 iteration_mapping::StridedLoop<named_usage::unspecified>,
946 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
949 cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
// Marks blocks active along `dim`. dims.blocks is set to `len`, while
// min_dims.blocks is only 1, unlike the Direct mappings where it is `len`.
951 template<
typename IdxT>
952 static LaunchDims get_dimensions(IdxT len)
956 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
957 set_cuda_dim<dim>(dims.dims.blocks,
static_cast<cuda_dim_member_t
>(len));
958 set_cuda_dim<dim>(dims.min_dims.blocks,
static_cast<cuda_dim_member_t
>(1));
// StridedLoop mapping; block size ignored, compile-time grid size.
// Only blocks are used along `dim`.
965 template<named_dim dim,
int GRID_SIZE, kernel_sync_requirement sync>
966 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
967 iteration_mapping::StridedLoop<named_usage::unspecified>,
969 cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
971 static_assert(GRID_SIZE > 0,
972 "grid size must be > 0, named_usage::unspecified, or "
973 "named_usage::ignored with kernel");
975 using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
// Marks blocks active along `dim`. dims.blocks is the grid size when len > 0
// and 0 for an empty range; min_dims.blocks is always the grid size.
977 template<
typename IdxT>
978 static LaunchDims get_dimensions(IdxT len)
980 constexpr
auto zero =
static_cast<IdxT
>(0);
984 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
985 set_cuda_dim<dim>(dims.dims.blocks,
986 static_cast<cuda_dim_member_t
>(
987 (len > zero) ? IndexMapper::grid_size : 0));
988 set_cuda_dim<dim>(dims.min_dims.blocks,
989 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// StridedLoop mapping; block size and grid size both unspecified.
// Both threads and blocks are used along `dim`, but this calculator requests
// only the smallest non-empty launch.
996 template<named_dim dim, kernel_sync_requirement sync>
997 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
998 iteration_mapping::StridedLoop<named_usage::unspecified>,
1000 cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
1002 using IndexMapper = cuda::
1003 IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
// For both threads and blocks, dims is 1 when len > 0 and 0 for an empty
// range; min_dims is always 1.
1005 template<
typename IdxT>
1006 static LaunchDims get_dimensions(IdxT len)
1008 constexpr
auto zero =
static_cast<IdxT
>(0);
1012 set_cuda_dim<dim>(dims.active.threads,
1013 static_cast<cuda_dim_member_t
>(
true));
1014 set_cuda_dim<dim>(dims.dims.threads,
1015 static_cast<cuda_dim_member_t
>((len > zero) ? 1 : 0));
1016 set_cuda_dim<dim>(dims.min_dims.threads,
static_cast<cuda_dim_member_t
>(1));
1018 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
1019 set_cuda_dim<dim>(dims.dims.blocks,
1020 static_cast<cuda_dim_member_t
>((len > zero) ? 1 : 0));
1021 set_cuda_dim<dim>(dims.min_dims.blocks,
static_cast<cuda_dim_member_t
>(1));
// StridedLoop mapping; block size unspecified, compile-time grid size.
// Both threads and blocks are used along `dim`. The grid size is fixed and the
// block size is computed at run time.
1028 template<named_dim dim,
int GRID_SIZE, kernel_sync_requirement sync>
1029 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
1030 iteration_mapping::StridedLoop<named_usage::unspecified>,
1032 cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
1034 static_assert(GRID_SIZE > 0,
1035 "grid size must be > 0, named_usage::unspecified, or "
1036 "named_usage::ignored with kernel");
1039 cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
// Threads: dims is block_size; min_dims is only 1.
// Blocks: dims is the grid size when len > 0 and 0 for an empty range;
// min_dims is always the grid size.
1041 template<
typename IdxT>
1042 static LaunchDims get_dimensions(IdxT len)
1044 constexpr
auto zero =
static_cast<IdxT
>(0);
// NOTE(review): the initializer of block_size is not shown alongside this
// code; presumably len divided by the grid size, rounded up -- confirm.
1048 const IdxT block_size =
1053 set_cuda_dim<dim>(dims.active.threads,
1054 static_cast<cuda_dim_member_t
>(
true));
1055 set_cuda_dim<dim>(dims.dims.threads,
1056 static_cast<cuda_dim_member_t
>(block_size));
1057 set_cuda_dim<dim>(dims.min_dims.threads,
static_cast<cuda_dim_member_t
>(1));
1059 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
1060 set_cuda_dim<dim>(dims.dims.blocks,
1061 static_cast<cuda_dim_member_t
>(
1062 (len > zero) ? IndexMapper::grid_size : 0));
1063 set_cuda_dim<dim>(dims.min_dims.blocks,
1064 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
// StridedLoop mapping; compile-time block size, grid size unspecified.
// Both threads and blocks are used along `dim`. The block size is fixed and
// the grid size is computed at run time.
1071 template<named_dim dim,
int BLOCK_SIZE, kernel_sync_requirement sync>
1072 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
1073 iteration_mapping::StridedLoop<named_usage::unspecified>,
1075 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
1077 static_assert(BLOCK_SIZE > 0,
1078 "block size must be > 0, named_usage::unspecified, or "
1079 "named_usage::ignored with kernel");
1082 cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
// Threads: dims is the block size when len > 0 and 0 for an empty range;
// min_dims is always the block size.
// Blocks: dims is grid_size; min_dims is only 1.
// NOTE(review): grid_size is computed from (len, block size); the start of
// that statement is not shown alongside this code. Presumably len divided by
// the block size, rounded up -- confirm.
1084 template<
typename IdxT>
1085 static LaunchDims get_dimensions(IdxT len)
1087 constexpr
auto zero =
static_cast<IdxT
>(0);
1090 len,
static_cast<IdxT
>(IndexMapper::block_size));
1094 set_cuda_dim<dim>(dims.active.threads,
1095 static_cast<cuda_dim_member_t
>(
true));
1096 set_cuda_dim<dim>(dims.dims.threads,
1097 static_cast<cuda_dim_member_t
>(
1098 (len > zero) ? IndexMapper::block_size : 0));
1099 set_cuda_dim<dim>(dims.min_dims.threads,
1100 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
1102 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
1103 set_cuda_dim<dim>(dims.dims.blocks,
1104 static_cast<cuda_dim_member_t
>(grid_size));
1105 set_cuda_dim<dim>(dims.min_dims.blocks,
static_cast<cuda_dim_member_t
>(1));
// StridedLoop mapping; compile-time block size and compile-time grid size.
// Both threads and blocks are used along `dim`, with fixed sizes.
1116 struct KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<
1117 iteration_mapping::StridedLoop<named_usage::unspecified>,
1119 cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
1121 static_assert(BLOCK_SIZE > 0,
1122 "block size must be > 0, named_usage::unspecified, or "
1123 "named_usage::ignored with kernel");
1124 static_assert(GRID_SIZE > 0,
1125 "grid size must be > 0, named_usage::unspecified, or "
1126 "named_usage::ignored with kernel");
1128 using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
// For both threads and blocks, dims is the compile-time size when len > 0 and
// 0 for an empty range; min_dims is always the compile-time size.
1130 template<
typename IdxT>
1131 static LaunchDims get_dimensions(IdxT len)
1133 constexpr
auto zero =
static_cast<IdxT
>(0);
1137 set_cuda_dim<dim>(dims.active.threads,
1138 static_cast<cuda_dim_member_t
>(
true));
1139 set_cuda_dim<dim>(dims.dims.threads,
1140 static_cast<cuda_dim_member_t
>(
1141 (len > zero) ? IndexMapper::block_size : 0));
1142 set_cuda_dim<dim>(dims.min_dims.threads,
1143 static_cast<cuda_dim_member_t
>(IndexMapper::block_size));
1145 set_cuda_dim<dim>(dims.active.blocks,
static_cast<cuda_dim_member_t
>(
true));
1146 set_cuda_dim<dim>(dims.dims.blocks,
1147 static_cast<cuda_dim_member_t
>(
1148 (len > zero) ? IndexMapper::grid_size : 0));
1149 set_cuda_dim<dim>(dims.min_dims.blocks,
1150 static_cast<cuda_dim_member_t
>(IndexMapper::grid_size));
Header file defining prototypes for routines used to manage memory for CUDA reductions and other operations.
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
named_dim
Definition: types.hpp:53
kernel_sync_requirement
Definition: types.hpp:63
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
RAJA header file containing user interface for RAJA::kernel.
Header file for RAJA type definitions.