21 #ifndef RAJA_policy_cuda_kernel_For_HPP
22 #define RAJA_policy_cuda_kernel_For_HPP
24 #include "RAJA/config.hpp"
40 template<
typename Data,
41 camp::idx_t ArgumentId,
44 typename... EnclosedStmts,
46 struct CudaStatementExecutor<
51 cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
62 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
68 cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
72 const diff_t i = IndexMapper::template index<diff_t>();
75 data.template assign_offset<ArgumentId>(i);
78 enclosed_stmts_t::exec(data, thread_active);
83 const diff_t len = segment_length<ArgumentId>(data);
85 LaunchDims dims = DimensionCalculator::get_dimensions(len);
87 LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
89 return combine(dims, enclosed_dims);
99 template<
typename Data,
100 camp::idx_t ArgumentId,
101 typename IndexMapper,
103 typename... EnclosedStmts,
105 struct CudaStatementExecutor<
107 statement::For<ArgumentId,
108 RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
121 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
127 cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
131 const diff_t len = segment_length<ArgumentId>(data);
132 const diff_t i = IndexMapper::template index<diff_t>();
136 const bool have_work = (i < len);
139 data.template assign_offset<ArgumentId>(i);
142 enclosed_stmts_t::exec(data, thread_active && have_work);
147 const diff_t len = segment_length<ArgumentId>(data);
149 LaunchDims dims = DimensionCalculator::get_dimensions(len);
151 LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
153 return combine(dims, enclosed_dims);
163 template<
typename Data,
164 camp::idx_t ArgumentId,
165 typename IndexMapper,
166 typename... EnclosedStmts,
168 struct CudaStatementExecutor<
170 statement::For<ArgumentId,
171 RAJA::policy::cuda::cuda_indexer<
172 iteration_mapping::StridedLoop<named_usage::unspecified>,
173 kernel_sync_requirement::sync,
185 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
190 RAJA::policy::cuda::cuda_indexer<
198 const diff_t len = segment_length<ArgumentId>(data);
199 const diff_t i_init = IndexMapper::template index<diff_t>();
200 const diff_t i_stride = IndexMapper::template size<diff_t>();
204 for (
diff_t ii = 0; ii < len; ii += i_stride)
206 const diff_t i = ii + i_init;
210 const bool have_work = (i < len);
213 data.template assign_offset<ArgumentId>(i);
216 enclosed_stmts_t::exec(data, thread_active && have_work);
222 diff_t len = segment_length<ArgumentId>(data);
224 LaunchDims dims = DimensionCalculator::get_dimensions(len);
226 LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
228 return combine(dims, enclosed_dims);
238 template<
typename Data,
239 camp::idx_t ArgumentId,
240 typename IndexMapper,
241 typename... EnclosedStmts,
243 struct CudaStatementExecutor<
245 statement::For<ArgumentId,
246 RAJA::policy::cuda::cuda_indexer<
247 iteration_mapping::StridedLoop<named_usage::unspecified>,
248 kernel_sync_requirement::none,
260 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
265 RAJA::policy::cuda::cuda_indexer<
273 const diff_t len = segment_length<ArgumentId>(data);
274 const diff_t i_init = IndexMapper::template index<diff_t>();
275 const diff_t i_stride = IndexMapper::template size<diff_t>();
279 for (
diff_t i = i_init; i < len; i += i_stride)
283 data.template assign_offset<ArgumentId>(i);
286 enclosed_stmts_t::exec(data, thread_active);
292 const diff_t len = segment_length<ArgumentId>(data);
294 LaunchDims dims = DimensionCalculator::get_dimensions(len);
296 LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
298 return combine(dims, enclosed_dims);
305 template<
typename Data,
306 camp::idx_t ArgumentId,
307 typename... EnclosedStmts,
309 struct CudaStatementExecutor<
311 statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
313 : CudaStatementExecutor<
317 RAJA::policy::cuda::cuda_indexer<
318 iteration_mapping::StridedLoop<named_usage::unspecified>,
319 kernel_sync_requirement::none,
320 cuda::IndexGlobal<named_dim::x,
321 named_usage::ignored,
322 named_usage::ignored>>,
332 template<
typename Data,
333 camp::idx_t ArgumentId,
335 typename... EnclosedStmts,
337 struct CudaStatementExecutor<Data,
338 statement::For<ArgumentId,
339 RAJA::cuda_warp_masked_direct<Mask>,
350 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
356 static_assert(mask_t::max_masked_size <=
357 RAJA::policy::cuda::device_constants.WARP_SIZE,
358 "BitMask is too large for CUDA warp size");
362 const diff_t len = segment_length<ArgumentId>(data);
364 const diff_t i = mask_t::maskValue((
diff_t)threadIdx.x);
367 data.template assign_offset<ArgumentId>(i);
370 enclosed_stmts_t::exec(data, thread_active && (i < len));
376 LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
380 const diff_t len = RAJA::policy::cuda::device_constants.WARP_SIZE;
383 set_cuda_dim<named_dim::x>(dims.dims.threads, len);
386 set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
397 template<
typename Data,
398 camp::idx_t ArgumentId,
400 typename... EnclosedStmts,
402 struct CudaStatementExecutor<Data,
403 statement::For<ArgumentId,
404 RAJA::cuda_warp_masked_loop<Mask>,
415 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
422 RAJA::internal::KernelDimensionCalculator<cuda_warp_loop>;
424 static_assert(mask_t::max_masked_size <=
425 RAJA::policy::cuda::device_constants.WARP_SIZE,
426 "BitMask is too large for CUDA warp size");
431 const diff_t len = segment_length<ArgumentId>(data);
432 const diff_t i_init = mask_t::maskValue((
diff_t)threadIdx.x);
433 const diff_t i_stride = (
diff_t)mask_t::max_masked_size;
436 for (
diff_t ii = 0; ii < len; ii += i_stride)
438 const diff_t i = ii + i_init;
442 bool have_work = i < len;
445 data.template assign_offset<ArgumentId>(i);
448 enclosed_stmts_t::exec(data, thread_active && have_work);
454 diff_t len = segment_length<ArgumentId>(data);
456 LaunchDims dims = DimensionCalculator::get_dimensions(len);
458 LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
460 return combine(dims, enclosed_dims);
469 template<
typename Data,
470 camp::idx_t ArgumentId,
472 typename... EnclosedStmts,
474 struct CudaStatementExecutor<
476 statement::For<ArgumentId,
477 RAJA::cuda_thread_masked_direct<Mask>,
488 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
495 cuda_thread_size_x_direct<mask_t::max_input_size>>;
499 const diff_t len = segment_length<ArgumentId>(data);
501 const diff_t i = mask_t::maskValue((
diff_t)threadIdx.x);
504 data.template assign_offset<ArgumentId>(i);
507 enclosed_stmts_t::exec(data, thread_active && (i < len));
512 const diff_t len = segment_length<ArgumentId>(data);
514 LaunchDims dims = DimensionCalculator::get_dimensions(len);
516 LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
518 return combine(dims, enclosed_dims);
527 template<
typename Data,
528 camp::idx_t ArgumentId,
530 typename... EnclosedStmts,
532 struct CudaStatementExecutor<Data,
533 statement::For<ArgumentId,
534 RAJA::cuda_thread_masked_loop<Mask>,
545 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
552 cuda_thread_size_x_loop<mask_t::max_input_size>>;
557 const diff_t len = segment_length<ArgumentId>(data);
558 const diff_t i_init = mask_t::maskValue((
diff_t)threadIdx.x);
559 const diff_t i_stride = (
diff_t)mask_t::max_masked_size;
562 for (
diff_t ii = 0; ii < len; ii += i_stride)
564 const diff_t i = ii + i_init;
568 bool have_work = i < len;
571 data.template assign_offset<ArgumentId>(i);
574 enclosed_stmts_t::exec(data, thread_active && have_work);
580 diff_t len = segment_length<ArgumentId>(data);
582 LaunchDims dims = DimensionCalculator::get_dimensions(len);
584 LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
586 return combine(dims, enclosed_dims);
#define RAJA_DEVICE
Definition: macros.hpp:66
setSegmentType< Types, Segment, camp::at_v< typename camp::decay< Data >::index_types_t, Segment > > setSegmentTypeFromData
Definition: LoopTypes.hpp:95
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
typename std::iterator_traits< typename camp::at_v< typename Data::segment_tuple_t::TList, ArgumentId >::iterator >::difference_type segment_diff_type
Definition: LoopData.hpp:184
Definition: AlignedRangeIndexSetBuilders.cpp:35
kernel_sync_requirement
Definition: types.hpp:63
RAJA header file containing constructs used to run kernel traversals on GPU with CUDA.
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:412
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:452
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:415
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:428
RAJA::internal::KernelDimensionCalculator< cuda_warp_loop > DimensionCalculator
Definition: For.hpp:422
Mask mask_t
Definition: For.hpp:417
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:419
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:409
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:542
RAJA::internal::KernelDimensionCalculator< cuda_thread_size_x_loop< mask_t::max_input_size > > DimensionCalculator
Definition: For.hpp:552
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:549
Mask mask_t
Definition: For.hpp:547
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:554
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:578
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:539
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:545
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:123
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:121
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:115
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::Direct, sync, IndexMapper > > DimensionCalculator
Definition: For.hpp:127
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:145
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:118
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:129
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:347
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:373
Mask mask_t
Definition: For.hpp:352
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:344
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:354
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:360
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:350
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:492
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:497
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:510
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:482
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:488
RAJA::internal::KernelDimensionCalculator< cuda_thread_size_x_direct< mask_t::max_input_size > > DimensionCalculator
Definition: For.hpp:495
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:485
Mask mask_t
Definition: For.hpp:490
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:262
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:254
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:260
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:290
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:270
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:257
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::StridedLoop< named_usage::unspecified >, kernel_sync_requirement::none, IndexMapper > > DimensionCalculator
Definition: For.hpp:268
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:220
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:195
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:179
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:185
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:187
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::StridedLoop< named_usage::unspecified >, kernel_sync_requirement::sync, IndexMapper > > DimensionCalculator
Definition: For.hpp:193
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:182
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::DirectUnchecked, sync, IndexMapper > > DimensionCalculator
Definition: For.hpp:68
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:59
static LaunchDims calculateDimensions(Data const &data)
Definition: For.hpp:81
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:56
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: For.hpp:70
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:62
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:64
Definition: types.hpp:209