RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
launch.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_pattern_launch_cuda_HPP
21 #define RAJA_pattern_launch_cuda_HPP
22 
28 #include "RAJA/util/resource.hpp"
29 
30 namespace RAJA
31 {
32 
38 template<typename IndicesAndDimsT>
39 class LaunchContextT<cuda::LaunchContextIndicesAndDimsPolicy<IndicesAndDimsT>>
40  : public LaunchContextBase
41 {
42 public:
43  using indices_and_dims_t = IndicesAndDimsT;
44 
45  /*[[no_unique_address]] -- add when we move to C++20 */
47 
50  indices_and_dims()
51  {}
52 
54  const
55  {
56  return indices_and_dims;
57  }
58 };
59 
60 template<typename BODY, typename ReduceParams>
61 __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
62  body_in,
63  ReduceParams reduce_params)
64 {
66  auto privatizer = thread_privatize(body_in);
67  auto& body = privatizer.get_priv();
68 
69  // Set pointer to shared memory
70  extern __shared__ char raja_shmem_ptr[];
71 
72  using LaunchContextType =
74 
76  ctx.shared_mem_ptr = raja_shmem_ptr;
78 
79  // Using a flatten global policy as we may use all dimensions
81  RAJA::cuda_flatten_global_xyz_direct {}, reduce_params);
82 }
83 
84 template<bool async>
86  RAJA::policy::cuda::cuda_launch_explicit_t<async,
87  named_usage::unspecified,
88  named_usage::unspecified>>
89 {
90 
91  template<typename BODY_IN, typename ReduceParams>
92  static concepts::enable_if_t<
93  resources::EventProxy<resources::Resource>,
95  exec(RAJA::resources::Resource res,
96  const LaunchParams& launch_params,
97  BODY_IN&& body_in,
98  ReduceParams& launch_reducers)
99  {
100  using BODY = camp::decay<BODY_IN>;
101  using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<
103  EXEC_POL pol {};
104 
105  auto func = reinterpret_cast<const void*>(
106  &launch_new_reduce_global_fcn<BODY, camp::decay<ReduceParams>>);
107 
108  resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
109 
110  //
111  // Compute the number of blocks and threads
112  //
113 
114  cuda_dim_t gridSize {
115  static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
116  static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
117  static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
118 
119  cuda_dim_t blockSize {
120  static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
121  static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
122  static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
123 
124  // Only launch kernel if we have something to iterate over
125  constexpr cuda_dim_member_t zero = 0;
126  if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
127  blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
128  {
129 
130 
131  size_t shared_mem_size = launch_params.shared_mem_size;
132  RAJA::cuda::detail::cudaInfo launch_info;
133  launch_info.gridDim = gridSize;
134  launch_info.blockDim = blockSize;
135  launch_info.dynamic_smem = &shared_mem_size;
136  launch_info.res = cuda_res;
137 
138  {
140  launch_info);
141 
142  //
143  // Privatize the loop_body, using make_launch_body to setup reductions
144  //
145  BODY body = RAJA::cuda::make_launch_body(
146  func, gridSize, blockSize, shared_mem_size, cuda_res,
147  std::forward<BODY_IN>(body_in));
148 
149  //
150  // Launch the kernel
151  //
152  void* args[] = {(void*)&body, (void*)&launch_reducers};
153  RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
154  cuda_res, async);
155 
157  launch_info);
158  }
159  }
160 
161  return resources::EventProxy<resources::Resource>(res);
162  }
163 };
164 
165 template<typename BODY,
166  int num_threads,
167  size_t BLOCKS_PER_SM,
168  typename ReduceParams>
169 __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
170  void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY
171  body_in,
172  ReduceParams reduce_params)
173 {
174 
176  auto privatizer = thread_privatize(body_in);
177  auto& body = privatizer.get_priv();
178 
179  // Set pointer to shared memory
180  extern __shared__ char raja_shmem_ptr[];
181 
184 
186  ctx.shared_mem_ptr = raja_shmem_ptr;
188 
189  // Using a flatten global policy as we may use all dimensions
191  RAJA::cuda_flatten_global_xyz_direct {}, reduce_params);
192 }
193 
194 template<bool async, int nthreads, size_t BLOCKS_PER_SM>
196  RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>
197 {
198 
199  template<typename BODY_IN, typename ReduceParams>
200  static concepts::enable_if_t<
201  resources::EventProxy<resources::Resource>,
203  exec(RAJA::resources::Resource res,
204  const LaunchParams& launch_params,
205  BODY_IN&& body_in,
206  ReduceParams& launch_reducers)
207  {
208  using BODY = camp::decay<BODY_IN>;
209  // Use a generic block size policy here to match that used in
210  // parampack_combine
211  using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<
213  EXEC_POL pol {};
214 
215  auto func = reinterpret_cast<const void*>(
216  &launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM,
217  camp::decay<ReduceParams>>);
218 
219  resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
220 
221  //
222  // Compute the number of blocks and threads
223  //
224 
225  cuda_dim_t gridSize {
226  static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
227  static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
228  static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
229 
230  cuda_dim_t blockSize {
231  static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
232  static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
233  static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
234 
235  // Only launch kernel if we have something to iterate over
236  constexpr cuda_dim_member_t zero = 0;
237  if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
238  blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
239  {
240 
241 
242  size_t shared_mem_size = launch_params.shared_mem_size;
243  RAJA::cuda::detail::cudaInfo launch_info;
244  launch_info.gridDim = gridSize;
245  launch_info.blockDim = blockSize;
246  launch_info.dynamic_smem = &shared_mem_size;
247  launch_info.res = cuda_res;
248 
249  {
251  launch_info);
252 
253  //
254  // Privatize the loop_body, using make_launch_body to setup reductions
255  //
256  BODY body = RAJA::cuda::make_launch_body(
257  func, gridSize, blockSize, shared_mem_size, cuda_res,
258  std::forward<BODY_IN>(body_in));
259 
260  //
261  // Launch the kernel
262  //
263  void* args[] = {(void*)&body, (void*)&launch_reducers};
264  RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
265  cuda_res, async);
266 
268  launch_info);
269  }
270  }
271 
272  return resources::EventProxy<resources::Resource>(res);
273  }
274 };
275 
276 /*
277  CUDA generic loop implementations
278 */
279 template<typename SEGMENT, typename IndexMapper>
280 struct LoopExecute<
281  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
282  kernel_sync_requirement::none,
283  IndexMapper>,
284  SEGMENT>
285 {
286 
287  using diff_t = typename std::iterator_traits<
288  typename SEGMENT::iterator>::difference_type;
289 
290  template<typename LaunchContextPolicy, typename BODY>
291  static RAJA_INLINE RAJA_DEVICE void exec(
293  SEGMENT const& segment,
294  BODY const& body)
295  {
296  const diff_t i =
297  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
298 
299  body(*(segment.begin() + i));
300  }
301 };
302 
303 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
304 struct LoopExecute<
305  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
306  kernel_sync_requirement::none,
307  IndexMapper0,
308  IndexMapper1>,
309  SEGMENT>
310 {
311 
312  using diff_t = typename std::iterator_traits<
313  typename SEGMENT::iterator>::difference_type;
314 
315  template<typename LaunchContextPolicy, typename BODY>
316  static RAJA_INLINE RAJA_DEVICE void exec(
318  SEGMENT const& segment0,
319  SEGMENT const& segment1,
320  BODY const& body)
321  {
322  const diff_t i0 =
323  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
324  const diff_t i1 =
325  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
326 
327  body(*(segment0.begin() + i0), *(segment1.begin() + i1));
328  }
329 };
330 
331 template<typename SEGMENT,
332  typename IndexMapper0,
333  typename IndexMapper1,
334  typename IndexMapper2>
335 struct LoopExecute<
336  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
337  kernel_sync_requirement::none,
338  IndexMapper0,
339  IndexMapper1,
340  IndexMapper2>,
341  SEGMENT>
342 {
343 
344  using diff_t = typename std::iterator_traits<
345  typename SEGMENT::iterator>::difference_type;
346 
347  template<typename LaunchContextPolicy, typename BODY>
348  static RAJA_INLINE RAJA_DEVICE void exec(
350  SEGMENT const& segment0,
351  SEGMENT const& segment1,
352  SEGMENT const& segment2,
353  BODY const& body)
354  {
355  const diff_t i0 =
356  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
357  const diff_t i1 =
358  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
359  const diff_t i2 =
360  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
361 
362  body(*(segment0.begin() + i0), *(segment1.begin() + i1),
363  *(segment2.begin() + i2));
364  }
365 };
366 
367 template<typename SEGMENT, typename IndexMapper>
368 struct LoopExecute<
369  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
370  kernel_sync_requirement::none,
371  IndexMapper>,
372  SEGMENT>
373 {
374 
375  using diff_t = typename std::iterator_traits<
376  typename SEGMENT::iterator>::difference_type;
377 
378  template<typename LaunchContextPolicy, typename BODY>
379  static RAJA_INLINE RAJA_DEVICE void exec(
381  SEGMENT const& segment,
382  BODY const& body)
383  {
384  const diff_t len = segment.end() - segment.begin();
385  const diff_t i =
386  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
387 
388  if (i < len)
389  {
390  body(*(segment.begin() + i));
391  }
392  }
393 };
394 
395 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
396 struct LoopExecute<
397  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
398  kernel_sync_requirement::none,
399  IndexMapper0,
400  IndexMapper1>,
401  SEGMENT>
402 {
403 
404  using diff_t = typename std::iterator_traits<
405  typename SEGMENT::iterator>::difference_type;
406 
407  template<typename LaunchContextPolicy, typename BODY>
408  static RAJA_INLINE RAJA_DEVICE void exec(
410  SEGMENT const& segment0,
411  SEGMENT const& segment1,
412  BODY const& body)
413  {
414  const int len0 = segment0.end() - segment0.begin();
415  const int len1 = segment1.end() - segment1.begin();
416 
417  const diff_t i0 =
418  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
419  const diff_t i1 =
420  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
421 
422  if (i0 < len0 && i1 < len1)
423  {
424  body(*(segment0.begin() + i0), *(segment1.begin() + i1));
425  }
426  }
427 };
428 
429 template<typename SEGMENT,
430  typename IndexMapper0,
431  typename IndexMapper1,
432  typename IndexMapper2>
433 struct LoopExecute<
434  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
435  kernel_sync_requirement::none,
436  IndexMapper0,
437  IndexMapper1,
438  IndexMapper2>,
439  SEGMENT>
440 {
441 
442  using diff_t = typename std::iterator_traits<
443  typename SEGMENT::iterator>::difference_type;
444 
445  template<typename LaunchContextPolicy, typename BODY>
446  static RAJA_INLINE RAJA_DEVICE void exec(
448  SEGMENT const& segment0,
449  SEGMENT const& segment1,
450  SEGMENT const& segment2,
451  BODY const& body)
452  {
453  const int len0 = segment0.end() - segment0.begin();
454  const int len1 = segment1.end() - segment1.begin();
455  const int len2 = segment2.end() - segment2.begin();
456 
457  const diff_t i0 =
458  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
459  const diff_t i1 =
460  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
461  const diff_t i2 =
462  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
463 
464  if (i0 < len0 && i1 < len1 && i2 < len2)
465  {
466  body(*(segment0.begin() + i0), *(segment1.begin() + i1),
467  *(segment2.begin() + i2));
468  }
469  }
470 };
471 
472 template<typename SEGMENT, typename IndexMapper>
473 struct LoopExecute<
474  RAJA::policy::cuda::cuda_indexer<
475  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
476  kernel_sync_requirement::none,
477  IndexMapper>,
478  SEGMENT>
479 {
480 
481  using diff_t = typename std::iterator_traits<
482  typename SEGMENT::iterator>::difference_type;
483 
484  template<typename BODY, typename LaunchContextPolicy>
485  static RAJA_INLINE RAJA_DEVICE void exec(
487  SEGMENT const& segment,
488  BODY const& body)
489  {
490  const diff_t len = segment.end() - segment.begin();
491  const diff_t i_init =
492  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
493  const diff_t i_stride =
494  IndexMapper::template size<diff_t>(ctx.get_indices_and_dims());
495 
496  for (diff_t i = i_init; i < len; i += i_stride)
497  {
498  body(*(segment.begin() + i));
499  }
500  }
501 };
502 
503 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
504 struct LoopExecute<
505  RAJA::policy::cuda::cuda_indexer<
506  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
507  kernel_sync_requirement::none,
508  IndexMapper0,
509  IndexMapper1>,
510  SEGMENT>
511 {
512 
513  using diff_t = typename std::iterator_traits<
514  typename SEGMENT::iterator>::difference_type;
515 
516  template<typename LaunchContextPolicy, typename BODY>
517  static RAJA_INLINE RAJA_DEVICE void exec(
519  SEGMENT const& segment0,
520  SEGMENT const& segment1,
521  BODY const& body)
522  {
523  const int len0 = segment0.end() - segment0.begin();
524  const int len1 = segment1.end() - segment1.begin();
525 
526  const diff_t i0_init =
527  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
528  const diff_t i1_init =
529  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
530 
531  const diff_t i0_stride =
532  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
533  const diff_t i1_stride =
534  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
535 
536  for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
537  {
538 
539  for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
540  {
541 
542  body(*(segment0.begin() + i0), *(segment1.begin() + i1));
543  }
544  }
545  }
546 };
547 
548 template<typename SEGMENT,
549  typename IndexMapper0,
550  typename IndexMapper1,
551  typename IndexMapper2>
552 struct LoopExecute<
553  RAJA::policy::cuda::cuda_indexer<
554  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
555  kernel_sync_requirement::none,
556  IndexMapper0,
557  IndexMapper1,
558  IndexMapper2>,
559  SEGMENT>
560 {
561 
562  using diff_t = typename std::iterator_traits<
563  typename SEGMENT::iterator>::difference_type;
564 
565  template<typename LaunchContextPolicy, typename BODY>
566  static RAJA_INLINE RAJA_DEVICE void exec(
568  SEGMENT const& segment0,
569  SEGMENT const& segment1,
570  SEGMENT const& segment2,
571  BODY const& body)
572  {
573  const int len0 = segment0.end() - segment0.begin();
574  const int len1 = segment1.end() - segment1.begin();
575  const int len2 = segment2.end() - segment2.begin();
576 
577  const diff_t i0_init =
578  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
579  const diff_t i1_init =
580  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
581  const diff_t i2_init =
582  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
583 
584  const diff_t i0_stride =
585  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
586  const diff_t i1_stride =
587  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
588  const diff_t i2_stride =
589  IndexMapper2::template size<diff_t>(ctx.get_indices_and_dims());
590 
591  for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
592  {
593 
594  for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
595  {
596 
597  for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
598  {
599 
600  body(*(segment0.begin() + i0), *(segment1.begin() + i1),
601  *(segment2.begin() + i2));
602  }
603  }
604  }
605  }
606 };
607 
608 /*
609  CUDA generic loop_icount implementations
610 */
611 template<typename SEGMENT, typename IndexMapper>
613  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
614  kernel_sync_requirement::none,
615  IndexMapper>,
616  SEGMENT>
617 {
618 
619  using diff_t = typename std::iterator_traits<
620  typename SEGMENT::iterator>::difference_type;
621 
622  template<typename LaunchContextPolicy, typename BODY>
623  static RAJA_INLINE RAJA_DEVICE void exec(
625  SEGMENT const& segment,
626  BODY const& body)
627  {
628  const diff_t i =
629  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
630 
631  body(*(segment.begin() + i), i);
632  }
633 };
634 
635 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
637  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
638  kernel_sync_requirement::none,
639  IndexMapper0,
640  IndexMapper1>,
641  SEGMENT>
642 {
643 
644  using diff_t = typename std::iterator_traits<
645  typename SEGMENT::iterator>::difference_type;
646 
647  template<typename LaunchContextPolicy, typename BODY>
648  static RAJA_INLINE RAJA_DEVICE void exec(
650  SEGMENT const& segment0,
651  SEGMENT const& segment1,
652  BODY const& body)
653  {
654  const diff_t i0 =
655  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
656  const diff_t i1 =
657  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
658 
659  body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
660  }
661 };
662 
663 template<typename SEGMENT,
664  typename IndexMapper0,
665  typename IndexMapper1,
666  typename IndexMapper2>
668  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
669  kernel_sync_requirement::none,
670  IndexMapper0,
671  IndexMapper1,
672  IndexMapper2>,
673  SEGMENT>
674 {
675 
676  using diff_t = typename std::iterator_traits<
677  typename SEGMENT::iterator>::difference_type;
678 
679  template<typename LaunchContextPolicy, typename BODY>
680  static RAJA_INLINE RAJA_DEVICE void exec(
682  SEGMENT const& segment0,
683  SEGMENT const& segment1,
684  SEGMENT const& segment2,
685  BODY const& body)
686  {
687  const diff_t i0 =
688  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
689  const diff_t i1 =
690  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
691  const diff_t i2 =
692  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
693 
694  body(*(segment0.begin() + i0), *(segment1.begin() + i1),
695  *(segment2.begin() + i2), i0, i1, i2);
696  }
697 };
698 
699 template<typename SEGMENT, typename IndexMapper>
701  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
702  kernel_sync_requirement::none,
703  IndexMapper>,
704  SEGMENT>
705 {
706 
707  using diff_t = typename std::iterator_traits<
708  typename SEGMENT::iterator>::difference_type;
709 
710  template<typename LaunchContextPolicy, typename BODY>
711  static RAJA_INLINE RAJA_DEVICE void exec(
713  SEGMENT const& segment,
714  BODY const& body)
715  {
716  const diff_t len = segment.end() - segment.begin();
717  const diff_t i =
718  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
719 
720  if (i < len)
721  {
722  body(*(segment.begin() + i), i);
723  }
724  }
725 };
726 
727 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
729  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
730  kernel_sync_requirement::none,
731  IndexMapper0,
732  IndexMapper1>,
733  SEGMENT>
734 {
735 
736  using diff_t = typename std::iterator_traits<
737  typename SEGMENT::iterator>::difference_type;
738 
739  template<typename LaunchContextPolicy, typename BODY>
740  static RAJA_INLINE RAJA_DEVICE void exec(
742  SEGMENT const& segment0,
743  SEGMENT const& segment1,
744  BODY const& body)
745  {
746  const int len0 = segment0.end() - segment0.begin();
747  const int len1 = segment1.end() - segment1.begin();
748 
749  const diff_t i0 =
750  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
751  const diff_t i1 =
752  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
753 
754  if (i0 < len0 && i1 < len1)
755  {
756  body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
757  }
758  }
759 };
760 
761 template<typename SEGMENT,
762  typename IndexMapper0,
763  typename IndexMapper1,
764  typename IndexMapper2>
766  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
767  kernel_sync_requirement::none,
768  IndexMapper0,
769  IndexMapper1,
770  IndexMapper2>,
771  SEGMENT>
772 {
773 
774  using diff_t = typename std::iterator_traits<
775  typename SEGMENT::iterator>::difference_type;
776 
777  template<typename LaunchContextPolicy, typename BODY>
778  static RAJA_INLINE RAJA_DEVICE void exec(
780  SEGMENT const& segment0,
781  SEGMENT const& segment1,
782  SEGMENT const& segment2,
783  BODY const& body)
784  {
785  const int len0 = segment0.end() - segment0.begin();
786  const int len1 = segment1.end() - segment1.begin();
787  const int len2 = segment2.end() - segment2.begin();
788 
789  const diff_t i0 =
790  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
791  const diff_t i1 =
792  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
793  const diff_t i2 =
794  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
795 
796  if (i0 < len0 && i1 < len1 && i2 < len2)
797  {
798  body(*(segment0.begin() + i0), *(segment1.begin() + i1),
799  *(segment2.begin() + i2), i0, i1, i2);
800  }
801  }
802 };
803 
804 template<typename SEGMENT, typename IndexMapper>
806  RAJA::policy::cuda::cuda_indexer<
807  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
808  kernel_sync_requirement::none,
809  IndexMapper>,
810  SEGMENT>
811 {
812 
813  using diff_t = typename std::iterator_traits<
814  typename SEGMENT::iterator>::difference_type;
815 
816  template<typename LaunchContextPolicy, typename BODY>
817  static RAJA_INLINE RAJA_DEVICE void exec(
819  SEGMENT const& segment,
820  BODY const& body)
821  {
822  const diff_t len = segment.end() - segment.begin();
823  const diff_t i_init =
824  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
825  const diff_t i_stride =
826  IndexMapper::template size<diff_t>(ctx.get_indices_and_dims());
827 
828  for (diff_t i = i_init; i < len; i += i_stride)
829  {
830  body(*(segment.begin() + i), i);
831  }
832  }
833 };
834 
835 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
837  RAJA::policy::cuda::cuda_indexer<
838  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
839  kernel_sync_requirement::none,
840  IndexMapper0,
841  IndexMapper1>,
842  SEGMENT>
843 {
844 
845  using diff_t = typename std::iterator_traits<
846  typename SEGMENT::iterator>::difference_type;
847 
848  template<typename LaunchContextPolicy, typename BODY>
849  static RAJA_INLINE RAJA_DEVICE void exec(
851  SEGMENT const& segment0,
852  SEGMENT const& segment1,
853  BODY const& body)
854  {
855  const int len0 = segment0.end() - segment0.begin();
856  const int len1 = segment1.end() - segment1.begin();
857 
858  const diff_t i0_init =
859  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
860  const diff_t i1_init =
861  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
862 
863  const diff_t i0_stride =
864  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
865  const diff_t i1_stride =
866  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
867 
868  for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
869  {
870 
871  for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
872  {
873 
874  body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
875  }
876  }
877  }
878 };
879 
880 template<typename SEGMENT,
881  typename IndexMapper0,
882  typename IndexMapper1,
883  typename IndexMapper2>
885  RAJA::policy::cuda::cuda_indexer<
886  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
887  kernel_sync_requirement::none,
888  IndexMapper0,
889  IndexMapper1,
890  IndexMapper2>,
891  SEGMENT>
892 {
893 
894  using diff_t = typename std::iterator_traits<
895  typename SEGMENT::iterator>::difference_type;
896 
897  template<typename LaunchContextPolicy, typename BODY>
898  static RAJA_INLINE RAJA_DEVICE void exec(
900  SEGMENT const& segment0,
901  SEGMENT const& segment1,
902  SEGMENT const& segment2,
903  BODY const& body)
904  {
905  const int len0 = segment0.end() - segment0.begin();
906  const int len1 = segment1.end() - segment1.begin();
907  const int len2 = segment2.end() - segment2.begin();
908 
909  const diff_t i0_init =
910  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
911  const diff_t i1_init =
912  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
913  const diff_t i2_init =
914  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
915 
916  const diff_t i0_stride =
917  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
918  const diff_t i1_stride =
919  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
920  const diff_t i2_stride =
921  IndexMapper2::template size<diff_t>(ctx.get_indices_and_dims());
922 
923  for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
924  {
925 
926  for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
927  {
928 
929  for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
930  {
931 
932  body(*(segment0.begin() + i0), *(segment1.begin() + i1),
933  *(segment2.begin() + i2), i0, i1, i2);
934  }
935  }
936  }
937  }
938 };
939 
940 /*
941  CUDA generic flattened loop implementations
942 */
943 template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
944 struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<
945  RAJA::iteration_mapping::DirectUnchecked,
946  sync,
947  IndexMapper0>,
948  SEGMENT>
949  : LoopExecute<RAJA::policy::cuda::cuda_indexer<
950  RAJA::iteration_mapping::DirectUnchecked,
951  sync,
952  IndexMapper0>,
953  SEGMENT>
954 {};
955 
956 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
957 struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<
958  RAJA::iteration_mapping::DirectUnchecked,
959  kernel_sync_requirement::none,
960  IndexMapper0,
961  IndexMapper1>,
962  SEGMENT>
963 {
964  using diff_t = typename std::iterator_traits<
965  typename SEGMENT::iterator>::difference_type;
966 
967  template<typename LaunchContextPolicy, typename BODY>
968  static RAJA_INLINE RAJA_DEVICE void exec(
970  SEGMENT const& segment,
971  BODY const& body)
972  {
973  const int i0 =
974  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
975  const int i1 =
976  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
977 
978  const diff_t i0_stride =
979  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
980 
981  const int i = i0 + i0_stride * i1;
982 
983  body(*(segment.begin() + i));
984  }
985 };
986 
987 template<typename SEGMENT,
988  typename IndexMapper0,
989  typename IndexMapper1,
990  typename IndexMapper2>
991 struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<
992  RAJA::iteration_mapping::DirectUnchecked,
993  kernel_sync_requirement::none,
994  IndexMapper0,
995  IndexMapper1,
996  IndexMapper2>,
997  SEGMENT>
998 {
999  using diff_t = typename std::iterator_traits<
1000  typename SEGMENT::iterator>::difference_type;
1001 
1002  template<typename LaunchContextPolicy, typename BODY>
1003  static RAJA_INLINE RAJA_DEVICE void exec(
1005  SEGMENT const& segment,
1006  BODY const& body)
1007  {
1008  const int i0 =
1009  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1010  const int i1 =
1011  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1012  const int i2 =
1013  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
1014 
1015  const diff_t i0_stride =
1016  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
1017  const diff_t i1_stride =
1018  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
1019 
1020  const int i = i0 + i0_stride * (i1 + i1_stride * i2);
1021 
1022  body(*(segment.begin() + i));
1023  }
1024 };
1025 
1026 template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
1028  RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
1029  sync,
1030  IndexMapper0>,
1031  SEGMENT>
1032  : LoopExecute<
1033  RAJA::policy::cuda::
1034  cuda_indexer<RAJA::iteration_mapping::Direct, sync, IndexMapper0>,
1035  SEGMENT>
1036 {};
1037 
1038 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1040  RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
1041  kernel_sync_requirement::none,
1042  IndexMapper0,
1043  IndexMapper1>,
1044  SEGMENT>
1045 {
1046  using diff_t = typename std::iterator_traits<
1047  typename SEGMENT::iterator>::difference_type;
1048 
1049  template<typename LaunchContextPolicy, typename BODY>
1050  static RAJA_INLINE RAJA_DEVICE void exec(
1052  SEGMENT const& segment,
1053  BODY const& body)
1054  {
1055  const int len = segment.end() - segment.begin();
1056 
1057  const int i0 =
1058  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1059  const int i1 =
1060  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1061 
1062  const diff_t i0_stride =
1063  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
1064 
1065  const int i = i0 + i0_stride * i1;
1066 
1067  if (i < len)
1068  {
1069  body(*(segment.begin() + i));
1070  }
1071  }
1072 };
1073 
1074 template<typename SEGMENT,
1075  typename IndexMapper0,
1076  typename IndexMapper1,
1077  typename IndexMapper2>
1079  RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
1080  kernel_sync_requirement::none,
1081  IndexMapper0,
1082  IndexMapper1,
1083  IndexMapper2>,
1084  SEGMENT>
1085 {
1086  using diff_t = typename std::iterator_traits<
1087  typename SEGMENT::iterator>::difference_type;
1088 
1089  template<typename LaunchContextPolicy, typename BODY>
1090  static RAJA_INLINE RAJA_DEVICE void exec(
1092  SEGMENT const& segment,
1093  BODY const& body)
1094  {
1095  const int len = segment.end() - segment.begin();
1096 
1097  const int i0 =
1098  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1099  const int i1 =
1100  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1101  const int i2 =
1102  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
1103 
1104  const diff_t i0_stride =
1105  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
1106  const diff_t i1_stride =
1107  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
1108 
1109  const int i = i0 + i0_stride * (i1 + i1_stride * i2);
1110 
1111  if (i < len)
1112  {
1113  body(*(segment.begin() + i));
1114  }
1115  }
1116 };
1117 
1118 template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
1120  RAJA::policy::cuda::cuda_flatten_indexer<
1121  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1122  sync,
1123  IndexMapper0>,
1124  SEGMENT>
1125  : LoopExecute<
1126  RAJA::policy::cuda::cuda_indexer<
1127  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1128  sync,
1129  IndexMapper0>,
1130  SEGMENT>
1131 {};
1132 
1133 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1135  RAJA::policy::cuda::cuda_flatten_indexer<
1136  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1137  kernel_sync_requirement::none,
1138  IndexMapper0,
1139  IndexMapper1>,
1140  SEGMENT>
1141 {
1142  using diff_t = typename std::iterator_traits<
1143  typename SEGMENT::iterator>::difference_type;
1144 
1145  template<typename LaunchContextPolicy, typename BODY>
1146  static RAJA_INLINE RAJA_DEVICE void exec(
1148  SEGMENT const& segment,
1149  BODY const& body)
1150  {
1151  const int len = segment.end() - segment.begin();
1152 
1153  const int i0 =
1154  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1155  const int i1 =
1156  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1157 
1158  const int i0_stride =
1159  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
1160  const int i1_stride =
1161  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
1162 
1163  for (int i = i0 + i0_stride * i1; i < len; i += i0_stride * i1_stride)
1164  {
1165  body(*(segment.begin() + i));
1166  }
1167  }
1168 };
1169 
1170 template<typename SEGMENT,
1171  typename IndexMapper0,
1172  typename IndexMapper1,
1173  typename IndexMapper2>
1175  RAJA::policy::cuda::cuda_flatten_indexer<
1176  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1177  kernel_sync_requirement::none,
1178  IndexMapper0,
1179  IndexMapper1,
1180  IndexMapper2>,
1181  SEGMENT>
1182 {
1183  using diff_t = typename std::iterator_traits<
1184  typename SEGMENT::iterator>::difference_type;
1185 
1186  template<typename LaunchContextPolicy, typename BODY>
1187  static RAJA_INLINE RAJA_DEVICE void exec(
1189  SEGMENT const& segment,
1190  BODY const& body)
1191  {
1192  const int len = segment.end() - segment.begin();
1193 
1194  const int i0 =
1195  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1196  const int i1 =
1197  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1198  const int i2 =
1199  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
1200 
1201  const int i0_stride =
1202  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
1203  const int i1_stride =
1204  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
1205  const int i2_stride =
1206  IndexMapper2::template size<diff_t>(ctx.get_indices_and_dims());
1207 
1208  for (int i = i0 + i0_stride * (i1 + i1_stride * i2); i < len;
1209  i += i0_stride * i1_stride * i2_stride)
1210  {
1211  body(*(segment.begin() + i));
1212  }
1213  }
1214 };
1215 
1216 /*
1217  CUDA generic tile implementations
1218 */
1219 template<typename SEGMENT, typename IndexMapper>
1221  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
1222  kernel_sync_requirement::none,
1223  IndexMapper>,
1224  SEGMENT>
1225 {
1226 
1227  using diff_t = typename std::iterator_traits<
1228  typename SEGMENT::iterator>::difference_type;
1229 
1230  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1231  static RAJA_INLINE RAJA_DEVICE void exec(
1233  TILE_T tile_size,
1234  SEGMENT const& segment,
1235  BODY const& body)
1236  {
1237  const diff_t i =
1238  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims()) *
1239  static_cast<diff_t>(tile_size);
1240 
1241  body(segment.slice(i, static_cast<diff_t>(tile_size)));
1242  }
1243 };
1244 
1245 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1247  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
1248  kernel_sync_requirement::none,
1249  IndexMapper0,
1250  IndexMapper1>,
1251  SEGMENT>
1252 {
1253 
1254  using diff_t = typename std::iterator_traits<
1255  typename SEGMENT::iterator>::difference_type;
1256 
1257  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1258  static RAJA_INLINE RAJA_DEVICE void exec(
1260  TILE_T tile_size0,
1261  TILE_T tile_size1,
1262  SEGMENT const& segment0,
1263  SEGMENT const& segment1,
1264  BODY const& body)
1265  {
1266  const diff_t i0 =
1267  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims()) *
1268  static_cast<diff_t>(tile_size0);
1269  const diff_t i1 =
1270  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims()) *
1271  static_cast<diff_t>(tile_size1);
1272 
1273  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1274  segment1.slice(i1, static_cast<diff_t>(tile_size1)));
1275  }
1276 };
1277 
1278 template<typename SEGMENT,
1279  typename IndexMapper0,
1280  typename IndexMapper1,
1281  typename IndexMapper2>
1283  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
1284  kernel_sync_requirement::none,
1285  IndexMapper0,
1286  IndexMapper1,
1287  IndexMapper2>,
1288  SEGMENT>
1289 {
1290 
1291  using diff_t = typename std::iterator_traits<
1292  typename SEGMENT::iterator>::difference_type;
1293 
1294  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1295  static RAJA_INLINE RAJA_DEVICE void exec(
1297  TILE_T tile_size0,
1298  TILE_T tile_size1,
1299  TILE_T tile_size2,
1300  SEGMENT const& segment0,
1301  SEGMENT const& segment1,
1302  SEGMENT const& segment2,
1303  BODY const& body)
1304  {
1305  const diff_t i0 =
1306  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims()) *
1307  static_cast<diff_t>(tile_size0);
1308  const diff_t i1 =
1309  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims()) *
1310  static_cast<diff_t>(tile_size1);
1311  const diff_t i2 =
1312  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims()) *
1313  static_cast<diff_t>(tile_size2);
1314 
1315  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1316  segment1.slice(i1, static_cast<diff_t>(tile_size1)),
1317  segment2.slice(i2, static_cast<diff_t>(tile_size2)));
1318  }
1319 };
1320 
1321 template<typename SEGMENT, typename IndexMapper>
1323  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
1324  kernel_sync_requirement::none,
1325  IndexMapper>,
1326  SEGMENT>
1327 {
1328 
1329  using diff_t = typename std::iterator_traits<
1330  typename SEGMENT::iterator>::difference_type;
1331 
1332  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1333  static RAJA_INLINE RAJA_DEVICE void exec(
1335  TILE_T tile_size,
1336  SEGMENT const& segment,
1337  BODY const& body)
1338  {
1339  const diff_t len = segment.end() - segment.begin();
1340  const diff_t i =
1341  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims()) *
1342  static_cast<diff_t>(tile_size);
1343 
1344  if (i < len)
1345  {
1346  body(segment.slice(i, static_cast<diff_t>(tile_size)));
1347  }
1348  }
1349 };
1350 
1351 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1353  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
1354  kernel_sync_requirement::none,
1355  IndexMapper0,
1356  IndexMapper1>,
1357  SEGMENT>
1358 {
1359 
1360  using diff_t = typename std::iterator_traits<
1361  typename SEGMENT::iterator>::difference_type;
1362 
1363  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1364  static RAJA_INLINE RAJA_DEVICE void exec(
1366  TILE_T tile_size0,
1367  TILE_T tile_size1,
1368  SEGMENT const& segment0,
1369  SEGMENT const& segment1,
1370  BODY const& body)
1371  {
1372  const diff_t len0 = segment0.end() - segment0.begin();
1373  const diff_t len1 = segment1.end() - segment1.begin();
1374 
1375  const diff_t i0 =
1376  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims()) *
1377  static_cast<diff_t>(tile_size0);
1378  const diff_t i1 =
1379  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims()) *
1380  static_cast<diff_t>(tile_size1);
1381 
1382  if (i0 < len0 && i1 < len1)
1383  {
1384  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1385  segment1.slice(i1, static_cast<diff_t>(tile_size1)));
1386  }
1387  }
1388 };
1389 
1390 template<typename SEGMENT,
1391  typename IndexMapper0,
1392  typename IndexMapper1,
1393  typename IndexMapper2>
1395  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
1396  kernel_sync_requirement::none,
1397  IndexMapper0,
1398  IndexMapper1,
1399  IndexMapper2>,
1400  SEGMENT>
1401 {
1402 
1403  using diff_t = typename std::iterator_traits<
1404  typename SEGMENT::iterator>::difference_type;
1405 
1406  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1407  static RAJA_INLINE RAJA_DEVICE void exec(
1409  TILE_T tile_size0,
1410  TILE_T tile_size1,
1411  TILE_T tile_size2,
1412  SEGMENT const& segment0,
1413  SEGMENT const& segment1,
1414  SEGMENT const& segment2,
1415  BODY const& body)
1416  {
1417  const diff_t len0 = segment0.end() - segment0.begin();
1418  const diff_t len1 = segment1.end() - segment1.begin();
1419  const diff_t len2 = segment2.end() - segment2.begin();
1420 
1421  const diff_t i0 =
1422  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims()) *
1423  static_cast<diff_t>(tile_size0);
1424  const diff_t i1 =
1425  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims()) *
1426  static_cast<diff_t>(tile_size1);
1427  const diff_t i2 =
1428  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims()) *
1429  static_cast<diff_t>(tile_size2);
1430 
1431  if (i0 < len0 && i1 < len1 && i2 < len2)
1432  {
1433  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1434  segment1.slice(i1, static_cast<diff_t>(tile_size1)),
1435  segment2.slice(i2, static_cast<diff_t>(tile_size2)));
1436  }
1437  }
1438 };
1439 
1440 template<typename SEGMENT, typename IndexMapper>
1442  RAJA::policy::cuda::cuda_indexer<
1443  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1444  kernel_sync_requirement::none,
1445  IndexMapper>,
1446  SEGMENT>
1447 {
1448 
1449  using diff_t = typename std::iterator_traits<
1450  typename SEGMENT::iterator>::difference_type;
1451 
1452  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1453  static RAJA_INLINE RAJA_DEVICE void exec(
1455  TILE_T tile_size,
1456  SEGMENT const& segment,
1457  BODY const& body)
1458  {
1459  const diff_t len = segment.end() - segment.begin();
1460  const diff_t i_init =
1461  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims()) *
1462  static_cast<diff_t>(tile_size);
1463  const diff_t i_stride =
1464  IndexMapper::template size<diff_t>(ctx.get_indices_and_dims()) *
1465  static_cast<diff_t>(tile_size);
1466 
1467  for (diff_t i = i_init; i < len; i += i_stride)
1468  {
1469  body(segment.slice(i, static_cast<diff_t>(tile_size)));
1470  }
1471  }
1472 };
1473 
1474 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1476  RAJA::policy::cuda::cuda_indexer<
1477  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1478  kernel_sync_requirement::none,
1479  IndexMapper0,
1480  IndexMapper1>,
1481  SEGMENT>
1482 {
1483 
1484  using diff_t = typename std::iterator_traits<
1485  typename SEGMENT::iterator>::difference_type;
1486 
1487  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1488  static RAJA_INLINE RAJA_DEVICE void exec(
1490  TILE_T tile_size0,
1491  TILE_T tile_size1,
1492  SEGMENT const& segment0,
1493  SEGMENT const& segment1,
1494  BODY const& body)
1495  {
1496  const diff_t len0 = segment0.end() - segment0.begin();
1497  const diff_t len1 = segment1.end() - segment1.begin();
1498 
1499  const diff_t i0_init =
1500  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims()) *
1501  static_cast<diff_t>(tile_size0);
1502  const diff_t i1_init =
1503  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims()) *
1504  static_cast<diff_t>(tile_size1);
1505 
1506  const diff_t i0_stride =
1507  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims()) *
1508  static_cast<diff_t>(tile_size0);
1509  const diff_t i1_stride =
1510  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims()) *
1511  static_cast<diff_t>(tile_size1);
1512 
1513  for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
1514  {
1515  for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
1516  {
1517  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1518  segment1.slice(i1, static_cast<diff_t>(tile_size1)));
1519  }
1520  }
1521  }
1522 };
1523 
1524 template<typename SEGMENT,
1525  typename IndexMapper0,
1526  typename IndexMapper1,
1527  typename IndexMapper2>
1529  RAJA::policy::cuda::cuda_indexer<
1530  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1531  kernel_sync_requirement::none,
1532  IndexMapper0,
1533  IndexMapper1,
1534  IndexMapper2>,
1535  SEGMENT>
1536 {
1537 
1538  using diff_t = typename std::iterator_traits<
1539  typename SEGMENT::iterator>::difference_type;
1540 
1541  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1542  static RAJA_INLINE RAJA_DEVICE void exec(
1544  TILE_T tile_size0,
1545  TILE_T tile_size1,
1546  TILE_T tile_size2,
1547  SEGMENT const& segment0,
1548  SEGMENT const& segment1,
1549  SEGMENT const& segment2,
1550  BODY const& body)
1551  {
1552  const diff_t len0 = segment0.end() - segment0.begin();
1553  const diff_t len1 = segment1.end() - segment1.begin();
1554  const diff_t len2 = segment2.end() - segment2.begin();
1555 
1556  const diff_t i0_init =
1557  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims()) *
1558  static_cast<diff_t>(tile_size0);
1559  const diff_t i1_init =
1560  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims()) *
1561  static_cast<diff_t>(tile_size1);
1562  const diff_t i2_init =
1563  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims()) *
1564  static_cast<diff_t>(tile_size2);
1565 
1566  const diff_t i0_stride =
1567  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims()) *
1568  static_cast<diff_t>(tile_size0);
1569  const diff_t i1_stride =
1570  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims()) *
1571  static_cast<diff_t>(tile_size1);
1572  const diff_t i2_stride =
1573  IndexMapper2::template size<diff_t>(ctx.get_indices_and_dims()) *
1574  static_cast<diff_t>(tile_size2);
1575 
1576  for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
1577  {
1578  for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
1579  {
1580  for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
1581  {
1582  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1583  segment1.slice(i1, static_cast<diff_t>(tile_size1)),
1584  segment2.slice(i2, static_cast<diff_t>(tile_size2)));
1585  }
1586  }
1587  }
1588  }
1589 };
1590 
1591 /*
1592  CUDA generic tile_tcount implementations
1593 */
1594 template<typename SEGMENT, typename IndexMapper>
1596  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
1597  kernel_sync_requirement::none,
1598  IndexMapper>,
1599  SEGMENT>
1600 {
1601 
1602  using diff_t = typename std::iterator_traits<
1603  typename SEGMENT::iterator>::difference_type;
1604 
1605  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1606  static RAJA_INLINE RAJA_DEVICE void exec(
1608  TILE_T tile_size,
1609  SEGMENT const& segment,
1610  BODY const& body)
1611  {
1612  const diff_t t =
1613  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
1614  const diff_t i = t * static_cast<diff_t>(tile_size);
1615 
1616  body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
1617  }
1618 };
1619 
1620 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1622  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
1623  kernel_sync_requirement::none,
1624  IndexMapper0,
1625  IndexMapper1>,
1626  SEGMENT>
1627 {
1628 
1629  using diff_t = typename std::iterator_traits<
1630  typename SEGMENT::iterator>::difference_type;
1631 
1632  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1633  static RAJA_INLINE RAJA_DEVICE void exec(
1635  TILE_T tile_size0,
1636  TILE_T tile_size1,
1637  SEGMENT const& segment0,
1638  SEGMENT const& segment1,
1639  BODY const& body)
1640  {
1641  const diff_t t0 =
1642  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1643  const diff_t t1 =
1644  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1645 
1646  const diff_t i0 = t0 * static_cast<diff_t>(tile_size0);
1647  const diff_t i1 = t1 * static_cast<diff_t>(tile_size1);
1648 
1649  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1650  segment1.slice(i1, static_cast<diff_t>(tile_size1)), t0, t1);
1651  }
1652 };
1653 
1654 template<typename SEGMENT,
1655  typename IndexMapper0,
1656  typename IndexMapper1,
1657  typename IndexMapper2>
1659  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
1660  kernel_sync_requirement::none,
1661  IndexMapper0,
1662  IndexMapper1,
1663  IndexMapper2>,
1664  SEGMENT>
1665 {
1666 
1667  using diff_t = typename std::iterator_traits<
1668  typename SEGMENT::iterator>::difference_type;
1669 
1670  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1671  static RAJA_INLINE RAJA_DEVICE void exec(
1673  TILE_T tile_size0,
1674  TILE_T tile_size1,
1675  TILE_T tile_size2,
1676  SEGMENT const& segment0,
1677  SEGMENT const& segment1,
1678  SEGMENT const& segment2,
1679  BODY const& body)
1680  {
1681  const diff_t t0 =
1682  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1683  const diff_t t1 =
1684  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1685  const diff_t t2 =
1686  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
1687 
1688  const diff_t i0 = t0 * static_cast<diff_t>(tile_size0);
1689  const diff_t i1 = t1 * static_cast<diff_t>(tile_size1);
1690  const diff_t i2 = t2 * static_cast<diff_t>(tile_size2);
1691 
1692  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1693  segment1.slice(i1, static_cast<diff_t>(tile_size1)),
1694  segment2.slice(i2, static_cast<diff_t>(tile_size2)), t0, t1, t2);
1695  }
1696 };
1697 
1698 template<typename SEGMENT, typename IndexMapper>
1700  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
1701  kernel_sync_requirement::none,
1702  IndexMapper>,
1703  SEGMENT>
1704 {
1705 
1706  using diff_t = typename std::iterator_traits<
1707  typename SEGMENT::iterator>::difference_type;
1708 
1709  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1710  static RAJA_INLINE RAJA_DEVICE void exec(
1712  TILE_T tile_size,
1713  SEGMENT const& segment,
1714  BODY const& body)
1715  {
1716  const diff_t len = segment.end() - segment.begin();
1717  const diff_t t =
1718  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
1719  const diff_t i = t * static_cast<diff_t>(tile_size);
1720 
1721  if (i < len)
1722  {
1723  body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
1724  }
1725  }
1726 };
1727 
1728 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1730  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
1731  kernel_sync_requirement::none,
1732  IndexMapper0,
1733  IndexMapper1>,
1734  SEGMENT>
1735 {
1736 
1737  using diff_t = typename std::iterator_traits<
1738  typename SEGMENT::iterator>::difference_type;
1739 
1740  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1741  static RAJA_INLINE RAJA_DEVICE void exec(
1743  TILE_T tile_size0,
1744  TILE_T tile_size1,
1745  SEGMENT const& segment0,
1746  SEGMENT const& segment1,
1747  BODY const& body)
1748  {
1749  const diff_t len0 = segment0.end() - segment0.begin();
1750  const diff_t len1 = segment1.end() - segment1.begin();
1751 
1752  const diff_t t0 =
1753  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1754  const diff_t t1 =
1755  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1756 
1757  const diff_t i0 = t0 * static_cast<diff_t>(tile_size0);
1758  const diff_t i1 = t1 * static_cast<diff_t>(tile_size1);
1759 
1760  if (i0 < len0 && i1 < len1)
1761  {
1762  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1763  segment1.slice(i1, static_cast<diff_t>(tile_size1)), t0, t1);
1764  }
1765  }
1766 };
1767 
1768 template<typename SEGMENT,
1769  typename IndexMapper0,
1770  typename IndexMapper1,
1771  typename IndexMapper2>
1773  RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
1774  kernel_sync_requirement::none,
1775  IndexMapper0,
1776  IndexMapper1,
1777  IndexMapper2>,
1778  SEGMENT>
1779 {
1780 
1781  using diff_t = typename std::iterator_traits<
1782  typename SEGMENT::iterator>::difference_type;
1783 
1784  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1785  static RAJA_INLINE RAJA_DEVICE void exec(
1787  TILE_T tile_size0,
1788  TILE_T tile_size1,
1789  TILE_T tile_size2,
1790  SEGMENT const& segment0,
1791  SEGMENT const& segment1,
1792  SEGMENT const& segment2,
1793  BODY const& body)
1794  {
1795  const diff_t len0 = segment0.end() - segment0.begin();
1796  const diff_t len1 = segment1.end() - segment1.begin();
1797  const diff_t len2 = segment2.end() - segment2.begin();
1798 
1799  const diff_t t0 =
1800  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1801  const diff_t t1 =
1802  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1803  const diff_t t2 =
1804  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
1805 
1806  const diff_t i0 = t0 * static_cast<diff_t>(tile_size0);
1807  const diff_t i1 = t1 * static_cast<diff_t>(tile_size1);
1808  const diff_t i2 = t2 * static_cast<diff_t>(tile_size2);
1809 
1810  if (i0 < len0 && i1 < len1 && i2 < len2)
1811  {
1812  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1813  segment1.slice(i1, static_cast<diff_t>(tile_size1)),
1814  segment2.slice(i2, static_cast<diff_t>(tile_size2)), t0, t1, t2);
1815  }
1816  }
1817 };
1818 
1819 template<typename SEGMENT, typename IndexMapper>
1821  RAJA::policy::cuda::cuda_indexer<
1822  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1823  kernel_sync_requirement::none,
1824  IndexMapper>,
1825  SEGMENT>
1826 {
1827 
1828  using diff_t = typename std::iterator_traits<
1829  typename SEGMENT::iterator>::difference_type;
1830 
1831  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1832  static RAJA_INLINE RAJA_DEVICE void exec(
1834  TILE_T tile_size,
1835  SEGMENT const& segment,
1836  BODY const& body)
1837  {
1838  const diff_t len = segment.end() - segment.begin();
1839  const diff_t t_init =
1840  IndexMapper::template index<diff_t>(ctx.get_indices_and_dims());
1841  const diff_t i_init = t_init * static_cast<diff_t>(tile_size);
1842  const diff_t t_stride =
1843  IndexMapper::template size<diff_t>(ctx.get_indices_and_dims());
1844  const diff_t i_stride = t_stride * static_cast<diff_t>(tile_size);
1845 
1846  for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
1847  {
1848  body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
1849  }
1850  }
1851 };
1852 
1853 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
1855  RAJA::policy::cuda::cuda_indexer<
1856  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1857  kernel_sync_requirement::none,
1858  IndexMapper0,
1859  IndexMapper1>,
1860  SEGMENT>
1861 {
1862 
1863  using diff_t = typename std::iterator_traits<
1864  typename SEGMENT::iterator>::difference_type;
1865 
1866  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1867  static RAJA_INLINE RAJA_DEVICE void exec(
1869  TILE_T tile_size0,
1870  TILE_T tile_size1,
1871  SEGMENT const& segment0,
1872  SEGMENT const& segment1,
1873  BODY const& body)
1874  {
1875  const diff_t len0 = segment0.end() - segment0.begin();
1876  const diff_t len1 = segment1.end() - segment1.begin();
1877 
1878  const diff_t t0_init =
1879  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1880  const diff_t t1_init =
1881  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1882 
1883  const diff_t i0_init = t0_init * static_cast<diff_t>(tile_size0);
1884  const diff_t i1_init = t1_init * static_cast<diff_t>(tile_size1);
1885 
1886  const diff_t t0_stride =
1887  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
1888  const diff_t t1_stride =
1889  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
1890 
1891  const diff_t i0_stride = t0_stride * static_cast<diff_t>(tile_size0);
1892  const diff_t i1_stride = t1_stride * static_cast<diff_t>(tile_size1);
1893 
1894  for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0;
1895  i0 += i0_stride, t0 += t0_stride)
1896  {
1897  for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1;
1898  i1 += i1_stride, t1 += t1_stride)
1899  {
1900  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1901  segment1.slice(i1, static_cast<diff_t>(tile_size1)), t0, t1);
1902  }
1903  }
1904  }
1905 };
1906 
1907 template<typename SEGMENT,
1908  typename IndexMapper0,
1909  typename IndexMapper1,
1910  typename IndexMapper2>
1912  RAJA::policy::cuda::cuda_indexer<
1913  RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
1914  kernel_sync_requirement::none,
1915  IndexMapper0,
1916  IndexMapper1,
1917  IndexMapper2>,
1918  SEGMENT>
1919 {
1920 
1921  using diff_t = typename std::iterator_traits<
1922  typename SEGMENT::iterator>::difference_type;
1923 
1924  template<typename LaunchContextPolicy, typename TILE_T, typename BODY>
1925  static RAJA_INLINE RAJA_DEVICE void exec(
1927  TILE_T tile_size0,
1928  TILE_T tile_size1,
1929  TILE_T tile_size2,
1930  SEGMENT const& segment0,
1931  SEGMENT const& segment1,
1932  SEGMENT const& segment2,
1933  BODY const& body)
1934  {
1935  const diff_t len0 = segment0.end() - segment0.begin();
1936  const diff_t len1 = segment1.end() - segment1.begin();
1937  const diff_t len2 = segment2.end() - segment2.begin();
1938 
1939  const diff_t t0_init =
1940  IndexMapper0::template index<diff_t>(ctx.get_indices_and_dims());
1941  const diff_t t1_init =
1942  IndexMapper1::template index<diff_t>(ctx.get_indices_and_dims());
1943  const diff_t t2_init =
1944  IndexMapper2::template index<diff_t>(ctx.get_indices_and_dims());
1945 
1946  const diff_t i0_init = t0_init * static_cast<diff_t>(tile_size0);
1947  const diff_t i1_init = t1_init * static_cast<diff_t>(tile_size1);
1948  const diff_t i2_init = t2_init * static_cast<diff_t>(tile_size2);
1949 
1950  const diff_t t0_stride =
1951  IndexMapper0::template size<diff_t>(ctx.get_indices_and_dims());
1952  const diff_t t1_stride =
1953  IndexMapper1::template size<diff_t>(ctx.get_indices_and_dims());
1954  const diff_t t2_stride =
1955  IndexMapper2::template size<diff_t>(ctx.get_indices_and_dims());
1956 
1957  const diff_t i0_stride = t0_stride * static_cast<diff_t>(tile_size0);
1958  const diff_t i1_stride = t1_stride * static_cast<diff_t>(tile_size1);
1959  const diff_t i2_stride = t2_stride * static_cast<diff_t>(tile_size2);
1960 
1961  for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0;
1962  i0 += i0_stride, t0 += t0_stride)
1963  {
1964  for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1;
1965  i1 += i1_stride, t1 += t1_stride)
1966  {
1967  for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2;
1968  i2 += i2_stride, t2 += t2_stride)
1969  {
1970  body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
1971  segment1.slice(i1, static_cast<diff_t>(tile_size1)),
1972  segment2.slice(i2, static_cast<diff_t>(tile_size2)), t0, t1, t2);
1973  }
1974  }
1975  }
1976  }
1977 };
1978 
1979 } // namespace RAJA
1980 #endif
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Definition: launch_core.hpp:192
RAJA_HOST_DEVICE RAJA_INLINE LaunchContextT()
Definition: launch.hpp:48
RAJA_HOST_DEVICE RAJA_INLINE indices_and_dims_t const & get_indices_and_dims() const
Definition: launch.hpp:53
Definition: launch_context_policy.hpp:30
Header file containing RAJA CUDA policy definitions.
RAJA header file containing the core components of RAJA::launch.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_DEVICE
Definition: macros.hpp:66
Args args
Definition: WorkRunner.hpp:212
constexpr RAJA_HOST_DEVICE auto invoke_body(Params &&params, Fn &&f, Ts &&... extra)
Definition: forall.hpp:598
RAJA_HOST_DEVICE auto thread_privatize(const T &item) -> Privatizer< T >
Create a private copy of the argument to be stored on the current thread's stack in a class of the Pr...
Definition: privatizer.hpp:88
Definition: AlignedRangeIndexSetBuilders.cpp:35
LaunchContextType ctx
Definition: launch.hpp:185
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in
ReduceParams reduce_params
Definition: launch.hpp:173
__global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in, ReduceParams reduce_params)
Definition: launch.hpp:61
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
auto & body
Definition: launch.hpp:177
__shared__ char raja_shmem_ptr[]
Definition: launch.hpp:174
@ unspecified
Definition: types.hpp:46
typename RAJA::detail::launch_context_type< BODY >::type LaunchContextType
Definition: launch.hpp:183
auto privatizer
Definition: launch.hpp:176
Header file containing utility methods used in CUDA operations.
Header file for RAJA resource definitions.
static concepts::enable_if_t< resources::EventProxy< resources::Resource >, RAJA::expt::type_traits::is_ForallParamPack< ReduceParams > > exec(RAJA::resources::Resource res, const LaunchParams &launch_params, BODY_IN &&body_in, ReduceParams &launch_reducers)
Definition: launch.hpp:203
static concepts::enable_if_t< resources::EventProxy< resources::Resource >, RAJA::expt::type_traits::is_ForallParamPack< ReduceParams > > exec(RAJA::resources::Resource res, const LaunchParams &launch_params, BODY_IN &&body_in, ReduceParams &launch_reducers)
Definition: launch.hpp:95
Definition: launch_core.hpp:263
Definition: launch_core.hpp:163
size_t shared_mem_size
Definition: launch_core.hpp:167
Teams teams
Definition: launch_core.hpp:165
Threads threads
Definition: launch_core.hpp:166
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1050
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1146
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1187
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:968
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1003
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1090
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:517
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:348
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:446
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:566
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:316
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:379
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:376
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:291
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:288
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:485
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:408
Definition: launch_core.hpp:480
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:708
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:711
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:817
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:680
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:778
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:740
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:623
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:849
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:898
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:648
Definition: launch_core.hpp:483
int value[3]
Definition: launch_core.hpp:99
int value[3]
Definition: launch_core.hpp:124
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:1542
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1333
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:1330
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:1364
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:1361
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:1488
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:1407
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:1295
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1453
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:1228
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1231
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:1258
Definition: launch_core.hpp:579
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:1867
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:1671
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1832
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:1785
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:1741
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:1707
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1710
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, SEGMENT const &segment0, SEGMENT const &segment1, BODY const &body)
Definition: launch.hpp:1633
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, SEGMENT const &segment0, SEGMENT const &segment1, SEGMENT const &segment2, BODY const &body)
Definition: launch.hpp:1925
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT< LaunchContextPolicy > const &ctx, TILE_T tile_size, SEGMENT const &segment, BODY const &body)
Definition: launch.hpp:1606
typename std::iterator_traits< typename SEGMENT::iterator >::difference_type diff_t
Definition: launch.hpp:1603
Definition: launch_core.hpp:582
static constexpr void parampack_resolve(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:304
static constexpr void parampack_init(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:269
static RAJA_HOST_DEVICE constexpr void parampack_combine(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:286
Definition: TypeTraits.hpp:59