21 #ifndef RAJA_policy_cuda_kernel_ForICount_HPP
22 #define RAJA_policy_cuda_kernel_ForICount_HPP
24 #include "RAJA/config.hpp"
41 template<
typename Data,
42 camp::idx_t ArgumentId,
46 typename... EnclosedStmts,
48 struct CudaStatementExecutor<
54 cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
57 : CudaStatementExecutor<
59 statement::For<ArgumentId,
60 RAJA::policy::cuda::cuda_indexer<
61 iteration_mapping::DirectUnchecked,
68 using Base = CudaStatementExecutor<
84 const diff_t i = IndexMapper::template index<diff_t>();
87 data.template assign_offset<ArgumentId>(i);
88 data.template assign_param<ParamId>(i);
91 enclosed_stmts_t::exec(data, thread_active);
102 template<
typename Data,
103 camp::idx_t ArgumentId,
105 typename IndexMapper,
107 typename... EnclosedStmts,
109 struct CudaStatementExecutor<
111 statement::ForICount<
115 cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
118 : CudaStatementExecutor<
123 cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
128 using Base = CudaStatementExecutor<
143 const diff_t len = segment_length<ArgumentId>(data);
144 const diff_t i = IndexMapper::template index<diff_t>();
148 const bool have_work = (i < len);
151 data.template assign_offset<ArgumentId>(i);
152 data.template assign_param<ParamId>(i);
155 enclosed_stmts_t::exec(data, thread_active && have_work);
166 template<
typename Data,
167 camp::idx_t ArgumentId,
169 typename IndexMapper,
170 typename... EnclosedStmts,
172 struct CudaStatementExecutor<
174 statement::ForICount<
177 RAJA::policy::cuda::cuda_indexer<
178 iteration_mapping::StridedLoop<named_usage::unspecified>,
179 kernel_sync_requirement::sync,
183 :
public CudaStatementExecutor<
187 RAJA::policy::cuda::cuda_indexer<
188 iteration_mapping::StridedLoop<named_usage::unspecified>,
189 kernel_sync_requirement::sync,
195 using Base = CudaStatementExecutor<
199 RAJA::policy::cuda::cuda_indexer<
212 const diff_t len = segment_length<ArgumentId>(data);
213 const diff_t i_init = IndexMapper::template index<diff_t>();
214 const diff_t i_stride = IndexMapper::template size<diff_t>();
218 for (
diff_t ii = 0; ii < len; ii += i_stride)
220 const diff_t i = ii + i_init;
224 const bool have_work = (i < len);
227 data.template assign_offset<ArgumentId>(i);
228 data.template assign_param<ParamId>(i);
231 enclosed_stmts_t::exec(data, thread_active && have_work);
243 template<
typename Data,
244 camp::idx_t ArgumentId,
246 typename IndexMapper,
247 typename... EnclosedStmts,
249 struct CudaStatementExecutor<
251 statement::ForICount<
254 RAJA::policy::cuda::cuda_indexer<
255 iteration_mapping::StridedLoop<named_usage::unspecified>,
256 kernel_sync_requirement::none,
260 :
public CudaStatementExecutor<
264 RAJA::policy::cuda::cuda_indexer<
265 iteration_mapping::StridedLoop<named_usage::unspecified>,
266 kernel_sync_requirement::none,
272 using Base = CudaStatementExecutor<
276 RAJA::policy::cuda::cuda_indexer<
289 const diff_t len = segment_length<ArgumentId>(data);
290 const diff_t i_init = IndexMapper::template index<diff_t>();
291 const diff_t i_stride = IndexMapper::template size<diff_t>();
295 for (
diff_t i = i_init; i < len; i += i_stride)
299 data.template assign_offset<ArgumentId>(i);
300 data.template assign_param<ParamId>(i);
303 enclosed_stmts_t::exec(data, thread_active);
315 template<
typename Data,
316 camp::idx_t ArgumentId,
318 typename... EnclosedStmts,
320 struct CudaStatementExecutor<
322 statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
324 : CudaStatementExecutor<
326 statement::ForICount<
328 RAJA::policy::cuda::cuda_indexer<
329 iteration_mapping::StridedLoop<named_usage::unspecified>,
330 kernel_sync_requirement::none,
331 cuda::IndexGlobal<named_dim::x,
332 named_usage::ignored,
333 named_usage::ignored>>,
343 template<
typename Data,
344 camp::idx_t ArgumentId,
347 typename... EnclosedStmts,
349 struct CudaStatementExecutor<
351 statement::ForICount<ArgumentId,
353 RAJA::cuda_warp_masked_direct<Mask>,
356 :
public CudaStatementExecutor<
358 statement::For<ArgumentId,
359 RAJA::cuda_warp_masked_direct<Mask>,
365 CudaStatementExecutor<Data,
367 RAJA::cuda_warp_masked_direct<Mask>,
379 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
383 static_assert(mask_t::max_masked_size <=
384 RAJA::policy::cuda::device_constants.WARP_SIZE,
385 "BitMask is too large for CUDA warp size");
389 const diff_t len = segment_length<ArgumentId>(data);
391 const diff_t i = mask_t::maskValue((
diff_t)threadIdx.x);
394 data.template assign_offset<ArgumentId>(i);
395 data.template assign_param<ParamId>(i);
398 enclosed_stmts_t::exec(data, thread_active && (i < len));
407 template<
typename Data,
408 camp::idx_t ArgumentId,
411 typename... EnclosedStmts,
413 struct CudaStatementExecutor<
415 statement::ForICount<ArgumentId,
417 RAJA::cuda_warp_masked_loop<Mask>,
420 :
public CudaStatementExecutor<
422 statement::For<ArgumentId,
423 RAJA::cuda_warp_masked_loop<Mask>,
429 CudaStatementExecutor<Data,
431 RAJA::cuda_warp_masked_loop<Mask>,
443 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
447 static_assert(mask_t::max_masked_size <=
448 RAJA::policy::cuda::device_constants.WARP_SIZE,
449 "BitMask is too large for CUDA warp size");
454 const diff_t len = segment_length<ArgumentId>(data);
455 const diff_t i_init = mask_t::maskValue((
diff_t)threadIdx.x);
456 const diff_t i_stride = (
diff_t)mask_t::max_masked_size;
459 for (
diff_t ii = 0; ii < len; ii += i_stride)
461 const diff_t i = ii + i_init;
465 bool have_work = i < len;
468 data.template assign_offset<ArgumentId>(i);
469 data.template assign_param<ParamId>(i);
472 enclosed_stmts_t::exec(data, thread_active && have_work);
482 template<
typename Data,
483 camp::idx_t ArgumentId,
486 typename... EnclosedStmts,
488 struct CudaStatementExecutor<
490 statement::ForICount<ArgumentId,
492 RAJA::cuda_thread_masked_direct<Mask>,
495 :
public CudaStatementExecutor<
497 statement::For<ArgumentId,
498 RAJA::cuda_thread_masked_direct<Mask>,
503 using Base = CudaStatementExecutor<
506 RAJA::cuda_thread_masked_direct<Mask>,
518 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
524 const diff_t len = segment_length<ArgumentId>(data);
526 const diff_t i = mask_t::maskValue((
diff_t)threadIdx.x);
529 data.template assign_offset<ArgumentId>(i);
530 data.template assign_param<ParamId>(i);
533 enclosed_stmts_t::exec(data, thread_active && (i < len));
542 template<
typename Data,
543 camp::idx_t ArgumentId,
546 typename... EnclosedStmts,
548 struct CudaStatementExecutor<
550 statement::ForICount<ArgumentId,
552 RAJA::cuda_thread_masked_loop<Mask>,
555 :
public CudaStatementExecutor<
557 statement::For<ArgumentId,
558 RAJA::cuda_thread_masked_loop<Mask>,
564 CudaStatementExecutor<Data,
566 RAJA::cuda_thread_masked_loop<Mask>,
578 CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
585 const diff_t len = segment_length<ArgumentId>(data);
586 const diff_t i_init = mask_t::maskValue((
diff_t)threadIdx.x);
587 const diff_t i_stride = (
diff_t)mask_t::max_masked_size;
590 for (
diff_t ii = 0; ii < len; ii += i_stride)
592 const diff_t i = ii + i_init;
596 bool have_work = i < len;
599 data.template assign_offset<ArgumentId>(i);
600 data.template assign_param<ParamId>(i);
603 enclosed_stmts_t::exec(data, thread_active && have_work);
#define RAJA_DEVICE
Definition: macros.hpp:66
setSegmentType< Types, Segment, camp::at_v< typename camp::decay< Data >::index_types_t, Segment > > setSegmentTypeFromData
Definition: LoopTypes.hpp:95
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
kernel_sync_requirement
Definition: types.hpp:63
RAJA header file containing constructs used to run kernel traversals on GPU with CUDA.
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:387
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:81
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:522
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:286
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:140
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:451
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:582
static RAJA_DEVICE void exec(Data &data, bool thread_active)
Definition: ForICount.hpp:209
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:412
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:415
Mask mask_t
Definition: For.hpp:417
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:419
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:409
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:542
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:549
Mask mask_t
Definition: For.hpp:547
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:539
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:545
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:123
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:121
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:347
Mask mask_t
Definition: For.hpp:352
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:344
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:354
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:350
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:492
StatementList< EnclosedStmts... > stmt_list_t
Definition: For.hpp:482
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:488
setSegmentTypeFromData< Types, ArgumentId, Data > NewTypes
Definition: For.hpp:485
Mask mask_t
Definition: For.hpp:490
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:262
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:260
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:185
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:187
CudaStatementListExecutor< Data, stmt_list_t, NewTypes > enclosed_stmts_t
Definition: For.hpp:62
segment_diff_type< ArgumentId, Data > diff_t
Definition: For.hpp:64
Definition: types.hpp:119
Definition: types.hpp:143
Definition: types.hpp:209