RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
For.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 
21 #ifndef RAJA_policy_cuda_kernel_For_HPP
22 #define RAJA_policy_cuda_kernel_For_HPP
23 
24 #include "RAJA/config.hpp"
25 
27 
28 namespace RAJA
29 {
30 
31 namespace internal
32 {
33 
34 /*
35  * Executor for work sharing inside CudaKernel.
36  * Mapping without checking from IndexMapper to indices
37  * Assigns the loop index to offset ArgumentId
38  * Meets all sync requirements
39  */
40 template<typename Data,
41  camp::idx_t ArgumentId,
42  typename IndexMapper,
44  typename... EnclosedStmts,
45  typename Types>
46 struct CudaStatementExecutor<
47  Data,
48  statement::For<
49  ArgumentId,
50  RAJA::policy::cuda::
51  cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
52  EnclosedStmts...>,
53  Types>
54 {
55 
56  using stmt_list_t = StatementList<EnclosedStmts...>;
57 
58  // Set the argument type for this loop
60 
62  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
63 
65 
66  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
67  RAJA::policy::cuda::
68  cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
69 
70  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
71  {
72  const diff_t i = IndexMapper::template index<diff_t>();
73 
74  // Assign the index to the argument
75  data.template assign_offset<ArgumentId>(i);
76 
77  // execute enclosed statements
78  enclosed_stmts_t::exec(data, thread_active);
79  }
80 
81  static inline LaunchDims calculateDimensions(Data const& data)
82  {
83  const diff_t len = segment_length<ArgumentId>(data);
84 
85  LaunchDims dims = DimensionCalculator::get_dimensions(len);
86 
87  LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
88 
89  return combine(dims, enclosed_dims);
90  }
91 };
92 
93 /*
94  * Executor for work sharing inside CudaKernel.
95  * Mapping directly from IndexMapper to indices
96  * Assigns the loop index to offset ArgumentId
97  * Meets all sync requirements
98  */
99 template<typename Data,
100  camp::idx_t ArgumentId,
101  typename IndexMapper,
103  typename... EnclosedStmts,
104  typename Types>
105 struct CudaStatementExecutor<
106  Data,
107  statement::For<ArgumentId,
108  RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
109  sync,
110  IndexMapper>,
111  EnclosedStmts...>,
112  Types>
113 {
114 
115  using stmt_list_t = StatementList<EnclosedStmts...>;
116 
117  // Set the argument type for this loop
119 
121  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
122 
124 
125  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
126  RAJA::policy::cuda::
127  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
128 
129  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
130  {
131  const diff_t len = segment_length<ArgumentId>(data);
132  const diff_t i = IndexMapper::template index<diff_t>();
133 
134  // execute enclosed statements if any thread will
135  // but mask off threads without work
136  const bool have_work = (i < len);
137 
138  // Assign the index to the argument
139  data.template assign_offset<ArgumentId>(i);
140 
141  // execute enclosed statements
142  enclosed_stmts_t::exec(data, thread_active && have_work);
143  }
144 
145  static inline LaunchDims calculateDimensions(Data const& data)
146  {
147  const diff_t len = segment_length<ArgumentId>(data);
148 
149  LaunchDims dims = DimensionCalculator::get_dimensions(len);
150 
151  LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
152 
153  return combine(dims, enclosed_dims);
154  }
155 };
156 
157 /*
158  * Executor for work sharing inside CudaKernel.
159  * Provides a strided loop for IndexMapper.
160  * Assigns the loop index to offset ArgumentId.
161  * Meets all sync requirements
162  */
163 template<typename Data,
164  camp::idx_t ArgumentId,
165  typename IndexMapper,
166  typename... EnclosedStmts,
167  typename Types>
168 struct CudaStatementExecutor<
169  Data,
170  statement::For<ArgumentId,
171  RAJA::policy::cuda::cuda_indexer<
172  iteration_mapping::StridedLoop<named_usage::unspecified>,
173  kernel_sync_requirement::sync,
174  IndexMapper>,
175  EnclosedStmts...>,
176  Types>
177 {
178 
179  using stmt_list_t = StatementList<EnclosedStmts...>;
180 
181  // Set the argument type for this loop
183 
185  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
186 
188 
189  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
190  RAJA::policy::cuda::cuda_indexer<
193  IndexMapper>>;
194 
195  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
196  {
197  // grid stride loop
198  const diff_t len = segment_length<ArgumentId>(data);
199  const diff_t i_init = IndexMapper::template index<diff_t>();
200  const diff_t i_stride = IndexMapper::template size<diff_t>();
201 
202  // Iterate through in chunks
203  // threads will have the same numbers of iterations
204  for (diff_t ii = 0; ii < len; ii += i_stride)
205  {
206  const diff_t i = ii + i_init;
207 
208  // execute enclosed statements if any thread will
209  // but mask off threads without work
210  const bool have_work = (i < len);
211 
212  // Assign the index to the argument
213  data.template assign_offset<ArgumentId>(i);
214 
215  // execute enclosed statements
216  enclosed_stmts_t::exec(data, thread_active && have_work);
217  }
218  }
219 
220  static inline LaunchDims calculateDimensions(Data const& data)
221  {
222  diff_t len = segment_length<ArgumentId>(data);
223 
224  LaunchDims dims = DimensionCalculator::get_dimensions(len);
225 
226  LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
227 
228  return combine(dims, enclosed_dims);
229  }
230 };
231 
232 /*
233  * Executor for work sharing inside CudaKernel.
234  * Provides a strided loop for IndexMapper.
235  * Assigns the loop index to offset ArgumentId.
236  * Meets no sync requirements
237  */
238 template<typename Data,
239  camp::idx_t ArgumentId,
240  typename IndexMapper,
241  typename... EnclosedStmts,
242  typename Types>
243 struct CudaStatementExecutor<
244  Data,
245  statement::For<ArgumentId,
246  RAJA::policy::cuda::cuda_indexer<
247  iteration_mapping::StridedLoop<named_usage::unspecified>,
248  kernel_sync_requirement::none,
249  IndexMapper>,
250  EnclosedStmts...>,
251  Types>
252 {
253 
254  using stmt_list_t = StatementList<EnclosedStmts...>;
255 
256  // Set the argument type for this loop
258 
260  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
261 
263 
264  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
265  RAJA::policy::cuda::cuda_indexer<
268  IndexMapper>>;
269 
270  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
271  {
272  // grid stride loop
273  const diff_t len = segment_length<ArgumentId>(data);
274  const diff_t i_init = IndexMapper::template index<diff_t>();
275  const diff_t i_stride = IndexMapper::template size<diff_t>();
276 
277  // Iterate through one at a time
278  // threads will have different numbers of iterations
279  for (diff_t i = i_init; i < len; i += i_stride)
280  {
281 
282  // Assign the index to the argument
283  data.template assign_offset<ArgumentId>(i);
284 
285  // execute enclosed statements
286  enclosed_stmts_t::exec(data, thread_active);
287  }
288  }
289 
290  static inline LaunchDims calculateDimensions(Data const& data)
291  {
292  const diff_t len = segment_length<ArgumentId>(data);
293 
294  LaunchDims dims = DimensionCalculator::get_dimensions(len);
295 
296  LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
297 
298  return combine(dims, enclosed_dims);
299  }
300 };
301 
302 /*
303  * Executor for sequential loops inside of a CudaKernel.
304  */
305 template<typename Data,
306  camp::idx_t ArgumentId,
307  typename... EnclosedStmts,
308  typename Types>
309 struct CudaStatementExecutor<
310  Data,
311  statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
312  Types>
313  : CudaStatementExecutor<
314  Data,
315  statement::For<
316  ArgumentId,
317  RAJA::policy::cuda::cuda_indexer<
318  iteration_mapping::StridedLoop<named_usage::unspecified>,
319  kernel_sync_requirement::none,
320  cuda::IndexGlobal<named_dim::x,
321  named_usage::ignored,
322  named_usage::ignored>>,
323  EnclosedStmts...>,
324  Types>
325 {};
326 
327 /*
328  * Executor for thread work sharing loop inside CudaKernel.
329  * Mapping directly from a warp lane
330  * Assigns the loop index to offset ArgumentId
331  */
332 template<typename Data,
333  camp::idx_t ArgumentId,
334  typename Mask,
335  typename... EnclosedStmts,
336  typename Types>
337 struct CudaStatementExecutor<Data,
338  statement::For<ArgumentId,
339  RAJA::cuda_warp_masked_direct<Mask>,
340  EnclosedStmts...>,
341  Types>
342 {
343 
344  using stmt_list_t = StatementList<EnclosedStmts...>;
345 
346  // Set the argument type for this loop
348 
350  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
351 
352  using mask_t = Mask;
353 
355 
356  static_assert(mask_t::max_masked_size <=
357  RAJA::policy::cuda::device_constants.WARP_SIZE,
358  "BitMask is too large for CUDA warp size");
359 
360  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
361  {
362  const diff_t len = segment_length<ArgumentId>(data);
363 
364  const diff_t i = mask_t::maskValue((diff_t)threadIdx.x);
365 
366  // assign thread id directly to offset
367  data.template assign_offset<ArgumentId>(i);
368 
369  // execute enclosed statements if in bounds
370  enclosed_stmts_t::exec(data, thread_active && (i < len));
371  }
372 
373  static inline LaunchDims calculateDimensions(Data const& data)
374  {
375  // Get enclosed statements
376  LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
377 
378  // we always get EXACTLY one warp by allocating one warp in the X
379  // dimension
380  const diff_t len = RAJA::policy::cuda::device_constants.WARP_SIZE;
381 
382  // request one thread per element in the segment
383  set_cuda_dim<named_dim::x>(dims.dims.threads, len);
384 
385  // since we are direct-mapping, we REQUIRE len
386  set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
387 
388  return (dims);
389  }
390 };
391 
392 /*
393  * Executor for thread work sharing loop inside CudaKernel.
394  * Mapping directly from a warp lane
395  * Assigns the loop index to offset ArgumentId
396  */
397 template<typename Data,
398  camp::idx_t ArgumentId,
399  typename Mask,
400  typename... EnclosedStmts,
401  typename Types>
402 struct CudaStatementExecutor<Data,
403  statement::For<ArgumentId,
404  RAJA::cuda_warp_masked_loop<Mask>,
405  EnclosedStmts...>,
406  Types>
407 {
408 
409  using stmt_list_t = StatementList<EnclosedStmts...>;
410 
411  // Set the argument type for this loop
413 
415  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
416 
417  using mask_t = Mask;
418 
420 
422  RAJA::internal::KernelDimensionCalculator<cuda_warp_loop>;
423 
424  static_assert(mask_t::max_masked_size <=
425  RAJA::policy::cuda::device_constants.WARP_SIZE,
426  "BitMask is too large for CUDA warp size");
427 
428  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
429  {
430  // masked size strided loop
431  const diff_t len = segment_length<ArgumentId>(data);
432  const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
433  const diff_t i_stride = (diff_t)mask_t::max_masked_size;
434 
435  // Iterate through grid stride of chunks
436  for (diff_t ii = 0; ii < len; ii += i_stride)
437  {
438  const diff_t i = ii + i_init;
439 
440  // execute enclosed statements if any thread will
441  // but mask off threads without work
442  bool have_work = i < len;
443 
444  // Assign the x thread to the argument
445  data.template assign_offset<ArgumentId>(i);
446 
447  // execute enclosed statements
448  enclosed_stmts_t::exec(data, thread_active && have_work);
449  }
450  }
451 
452  static inline LaunchDims calculateDimensions(Data const& data)
453  {
454  diff_t len = segment_length<ArgumentId>(data);
455 
456  LaunchDims dims = DimensionCalculator::get_dimensions(len);
457 
458  LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
459 
460  return combine(dims, enclosed_dims);
461  }
462 };
463 
464 /*
465  * Executor for thread work sharing loop inside CudaKernel.
466  * Mapping directly from raw threadIdx.x
467  * Assigns the loop index to offset ArgumentId
468  */
469 template<typename Data,
470  camp::idx_t ArgumentId,
471  typename Mask,
472  typename... EnclosedStmts,
473  typename Types>
474 struct CudaStatementExecutor<
475  Data,
476  statement::For<ArgumentId,
477  RAJA::cuda_thread_masked_direct<Mask>,
478  EnclosedStmts...>,
479  Types>
480 {
481 
482  using stmt_list_t = StatementList<EnclosedStmts...>;
483 
484  // Set the argument type for this loop
486 
488  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
489 
490  using mask_t = Mask;
491 
493 
494  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
495  cuda_thread_size_x_direct<mask_t::max_input_size>>;
496 
497  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
498  {
499  const diff_t len = segment_length<ArgumentId>(data);
500 
501  const diff_t i = mask_t::maskValue((diff_t)threadIdx.x);
502 
503  // assign thread id directly to offset
504  data.template assign_offset<ArgumentId>(i);
505 
506  // execute enclosed statements if in bounds
507  enclosed_stmts_t::exec(data, thread_active && (i < len));
508  }
509 
510  static inline LaunchDims calculateDimensions(Data const& data)
511  {
512  const diff_t len = segment_length<ArgumentId>(data);
513 
514  LaunchDims dims = DimensionCalculator::get_dimensions(len);
515 
516  LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
517 
518  return combine(dims, enclosed_dims);
519  }
520 };
521 
522 /*
523  * Executor for thread work sharing loop inside CudaKernel.
524  * Mapping directly from a warp lane
525  * Assigns the loop index to offset ArgumentId
526  */
527 template<typename Data,
528  camp::idx_t ArgumentId,
529  typename Mask,
530  typename... EnclosedStmts,
531  typename Types>
532 struct CudaStatementExecutor<Data,
533  statement::For<ArgumentId,
534  RAJA::cuda_thread_masked_loop<Mask>,
535  EnclosedStmts...>,
536  Types>
537 {
538 
539  using stmt_list_t = StatementList<EnclosedStmts...>;
540 
541  // Set the argument type for this loop
543 
545  CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
546 
547  using mask_t = Mask;
548 
550 
551  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
552  cuda_thread_size_x_loop<mask_t::max_input_size>>;
553 
554  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
555  {
556  // masked size strided loop
557  const diff_t len = segment_length<ArgumentId>(data);
558  const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
559  const diff_t i_stride = (diff_t)mask_t::max_masked_size;
560 
561  // Iterate through grid stride of chunks
562  for (diff_t ii = 0; ii < len; ii += i_stride)
563  {
564  const diff_t i = ii + i_init;
565 
566  // execute enclosed statements if any thread will
567  // but mask off threads without work
568  bool have_work = i < len;
569 
570  // Assign the x thread to the argument
571  data.template assign_offset<ArgumentId>(i);
572 
573  // execute enclosed statements
574  enclosed_stmts_t::exec(data, thread_active && have_work);
575  }
576  }
577 
578  static inline LaunchDims calculateDimensions(Data const& data)
579  {
580  diff_t len = segment_length<ArgumentId>(data);
581 
582  LaunchDims dims = DimensionCalculator::get_dimensions(len);
583 
584  LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
585 
586  return combine(dims, enclosed_dims);
587  }
588 };
589 
590 } // namespace internal
591 } // end namespace RAJA
592 
593 
594 #endif /* RAJA_policy_cuda_kernel_For_HPP */
#define RAJA_DEVICE
Definition: macros.hpp:66
setSegmentType< Types, Segment, camp::at_v< typename camp::decay< Data >::index_types_t, Segment > > setSegmentTypeFromData
Definition: LoopTypes.hpp:95
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
typename std::iterator_traits< typename camp::at_v< typename Data::segment_tuple_t::TList, ArgumentId >::iterator >::difference_type segment_diff_type
Definition: LoopData.hpp:184
Definition: AlignedRangeIndexSetBuilders.cpp:35
kernel_sync_requirement
Definition: types.hpp:63
RAJA header file containing constructs used to run kernel traversals on GPU with CUDA.
RAJA::internal::KernelDimensionCalculator< cuda_thread_size_x_loop< mask_t::max_input_size > > DimensionCalculator
Definition: For.hpp:552
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::Direct, sync, IndexMapper > > DimensionCalculator
Definition: For.hpp:127
RAJA::internal::KernelDimensionCalculator< cuda_thread_size_x_direct< mask_t::max_input_size > > DimensionCalculator
Definition: For.hpp:495
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::StridedLoop< named_usage::unspecified >, kernel_sync_requirement::none, IndexMapper > > DimensionCalculator
Definition: For.hpp:268
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::StridedLoop< named_usage::unspecified >, kernel_sync_requirement::sync, IndexMapper > > DimensionCalculator
Definition: For.hpp:193
RAJA::internal::KernelDimensionCalculator< RAJA::policy::cuda::cuda_indexer< iteration_mapping::DirectUnchecked, sync, IndexMapper > > DimensionCalculator
Definition: For.hpp:68
Definition: types.hpp:209