RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
policy.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_policy_cuda_HPP
21 #define RAJA_policy_cuda_HPP
22 
23 #include "RAJA/config.hpp"
24 
25 #if defined(RAJA_CUDA_ACTIVE)
26 
27 #include <cstddef>
28 #include <utility>
29 
30 #include "RAJA/pattern/reduce.hpp"
31 
35 
36 #include "RAJA/util/Operators.hpp"
38 #include "RAJA/util/types.hpp"
39 #include "RAJA/util/math.hpp"
40 
41 namespace RAJA
42 {
43 
44 using cuda_dim_t = RAJA_CUDA_DIM_T;
45 
46 using cuda_dim_member_t = camp::decay<decltype(std::declval<cuda_dim_t>().x)>;
47 
48 //
50 //
51 // Execution policies
52 //
54 //
55 
59 
60 namespace detail
61 {
62 template<bool Async>
63 struct get_launch
64 {
65  static constexpr RAJA::Launch value = RAJA::Launch::async;
66 };
67 
68 template<>
69 struct get_launch<false>
70 {
71  static constexpr RAJA::Launch value = RAJA::Launch::sync;
72 };
73 } // end namespace detail
74 
75 namespace cuda
76 {
77 
79 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
80 struct IndexGlobal;
81 
82 template<typename... indexers>
83 struct IndexFlatten;
84 
85 template<size_t divisor, typename index>
86 struct IndexDivide;
87 
88 template<size_t divisor, typename index>
89 struct IndexModulo;
90 
// Concretizer that uses the full occupancy recorded in `data`.
struct MaxOccupancyConcretizer
{
  // Returns data.func_max_blocks_per_sm * data.device_sm_per_device, with
  // both fields converted to IdxT before the multiplication.
  template<typename IdxT, typename Data>
  static IdxT get_max_grid_size(Data const& data)
  {
    const IdxT sm_count      = data.device_sm_per_device;
    const IdxT blocks_per_sm = data.func_max_blocks_per_sm;

    const IdxT blocks_per_device = blocks_per_sm * sm_count;
    return blocks_per_device;
  }
};
111 
// Concretizer that scales the per-SM block count by a fraction and then adds
// a signed per-SM offset. Each adjustment is applied only when its result is
// strictly positive; otherwise the previous value is kept.
//
// t_Fraction must provide `rebind<IdxT>` whose static `multiply` applies the
// fraction. BLOCKS_PER_SM_OFFSET is the signed number of blocks added per SM.
template<typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
struct FractionOffsetOccupancyConcretizer
{
  // Returns the adjusted per-SM block count multiplied by
  // data.device_sm_per_device.
  template<typename IdxT, typename Data>
  static IdxT get_max_grid_size(Data const& data)
  {
    using Fraction = typename t_Fraction::template rebind<IdxT>;

    IdxT device_sm_per_device   = data.device_sm_per_device;
    IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;

    // Evaluate the scaled value once and keep it only if blocks remain.
    const auto scaled_blocks_per_sm =
        Fraction::multiply(func_max_blocks_per_sm);
    if (scaled_blocks_per_sm > IdxT(0))
    {
      func_max_blocks_per_sm = scaled_blocks_per_sm;
    }

    // Compare in the signed domain: casting a negative sum to an unsigned
    // IdxT before the test would wrap to a huge positive value and pass.
    const std::ptrdiff_t offset_blocks_per_sm =
        static_cast<std::ptrdiff_t>(func_max_blocks_per_sm) +
        BLOCKS_PER_SM_OFFSET;
    if (offset_blocks_per_sm > 0)
    {
      func_max_blocks_per_sm = static_cast<IdxT>(offset_blocks_per_sm);
    }

    IdxT func_max_blocks_per_device =
        func_max_blocks_per_sm * device_sm_per_device;

    return func_max_blocks_per_device;
  }
};
148 
157 template<typename AvoidMaxOccupancyConcretizer>
158 struct AvoidDeviceMaxThreadOccupancyConcretizer
159 {
160  template<typename IdxT, typename Data>
161  static IdxT get_max_grid_size(Data const& data)
162  {
163  IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
164  IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
165  IdxT func_threads_per_block = data.func_threads_per_block;
166 
167  IdxT func_max_threads_per_sm =
168  func_threads_per_block * func_max_blocks_per_sm;
169 
170  if (func_max_threads_per_sm < device_max_threads_per_sm)
171  {
172  return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
173  }
174  else
175  {
176  return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
177  data);
178  }
179  }
180 };
181 
185 template<size_t preferred_replication>
186 struct ConstantPreferredReplicationConcretizer
187 {
188  template<typename IdxT, typename Data>
189  static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
190  {
191  return IdxT(preferred_replication);
192  }
193 };
194 
// Concretizer that picks one of two constant replications depending on
// whether data.func_threads_per_block is below `t_cutoff`.
template<size_t t_cutoff,
         size_t preferred_replication_before_cutoff,
         size_t preferred_replication_after_cutoff>
struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
{
  // Returns the "before" value when threads per block < cutoff, otherwise
  // the "after" value (a block size equal to the cutoff counts as "after").
  template<typename IdxT, typename Data>
  static IdxT get_preferred_replication(Data const& data)
  {
    const IdxT threshold         = t_cutoff;
    const IdxT threads_per_block = data.func_threads_per_block;

    return (threads_per_block < threshold)
               ? IdxT(preferred_replication_before_cutoff)
               : IdxT(preferred_replication_after_cutoff);
  }
};
221 
// Concretizer for shared atomic replication: takes the smaller of the
// preferred replication and data.func_max_shared_replication_per_block and
// passes it through prev_pow2.
template<typename GetPreferredReplication>
struct SharedAtomicReplicationMaxPow2Concretizer
{
  template<typename IdxT, typename Data>
  static IdxT get_shared_replication(Data const& data)
  {
    const IdxT block_limit = data.func_max_shared_replication_per_block;

    const IdxT wanted =
        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
            data);

    const IdxT capped = std::min(wanted, block_limit);
    return prev_pow2(capped);
  }
};
244 
// Concretizer for global atomic replication: takes the larger of the
// preferred replication and data.func_min_global_replication and passes it
// through next_pow2.
template<typename GetPreferredReplication>
struct GlobalAtomicReplicationMinPow2Concretizer
{
  template<typename IdxT, typename Data>
  static IdxT get_global_replication(Data const& data)
  {
    const IdxT global_floor = data.func_min_global_replication;

    const IdxT wanted =
        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
            data);

    const IdxT raised = std::max(wanted, global_floor);
    return next_pow2(raised);
  }
};
266 
267 
// Selects the reduction algorithm variant used by a ReduceTuning.
enum struct reduce_algorithm : int
{
  combine_last_block               = 0,
  init_device_combine_atomic_block = 1,
  init_host_combine_atomic_block   = 2
};

// Selects the fence mode used for communication by a ReduceTuning.
enum struct block_communication_mode : int
{
  device_fence = 0,
  block_fence  = 1
};

// Compile-time bundle of reduction tuning parameters. `consistent` is true
// only for the combine_last_block algorithm.
template<reduce_algorithm t_algorithm,
         block_communication_mode t_comm_mode,
         size_t t_replication,
         size_t t_atomic_stride>
struct ReduceTuning
{
  static constexpr reduce_algorithm algorithm         = t_algorithm;
  static constexpr block_communication_mode comm_mode = t_comm_mode;
  static constexpr size_t replication                 = t_replication;
  static constexpr size_t atomic_stride               = t_atomic_stride;
  static constexpr bool consistent =
      (t_algorithm == reduce_algorithm::combine_last_block);
};
294 
295 
// Selects the multi-reduction algorithm variant used by a MultiReduceTuning.
enum struct multi_reduce_algorithm : int
{
  init_host_combine_block_atomic_then_grid_atomic = 0,
  init_host_combine_global_atomic                 = 1
};
301 
// Compile-time bundle that re-exports its three type arguments under fixed
// member alias names.
template<typename ConcretizerT, typename IndexerT, typename OffsetCalculatorT>
struct AtomicReplicationTuning
{
  using AtomicReplicationConcretizer = ConcretizerT;
  using ReplicationIndexer           = IndexerT;
  using OffsetCalculator             = OffsetCalculatorT;
};
311 
312 template<multi_reduce_algorithm t_algorithm,
313  typename t_SharedAtomicReplicationTuning,
314  typename t_GlobalAtomicReplicationTuning>
315 struct MultiReduceTuning
316 {
317  static constexpr multi_reduce_algorithm algorithm = t_algorithm;
318  using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
319  using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
320  static constexpr bool consistent = false;
321 };
322 
323 } // namespace cuda
324 
325 namespace policy
326 {
327 namespace cuda
328 {
329 
// Empty policy type parameterized on an iteration mapping and a pack of
// iteration getters; it declares no members and no base class here.
// NOTE(review): the listing jumps from source line 330 to 332, so a template
// parameter may be missing from this declaration -- confirm against the
// upstream header.
330 template<typename _IterationMapping,
332  typename... _IterationGetters>
333 struct cuda_indexer
334 {};
335 
336 template<typename _IterationMapping,
338  typename... _IterationGetters>
339 struct cuda_flatten_indexer
341  RAJA::Policy::cuda,
342  RAJA::Pattern::region,
343  detail::get_launch<true /*async */>::value,
344  RAJA::Platform::cuda>
345 {
346  using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>;
347 };
348 
349 template<typename _IterationMapping,
350  typename _IterationGetter,
351  typename _LaunchConcretizer,
352  size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
353  bool Async = false>
354 struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t<
355  RAJA::Policy::cuda,
356  RAJA::Pattern::forall,
357  detail::get_launch<Async>::value,
358  RAJA::Platform::cuda>
359 {
360  using IterationMapping = _IterationMapping;
361  using IterationGetter = _IterationGetter;
362  using LaunchConcretizer = _LaunchConcretizer;
363 };
364 
365 template<bool Async,
366  int num_threads = named_usage::unspecified,
367  size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
368 struct cuda_launch_explicit_t
370  RAJA::Policy::cuda,
371  RAJA::Pattern::region,
372  detail::get_launch<Async>::value,
373  RAJA::Platform::cuda>
374 {};
375 
376 //
377 // NOTE: There is no Index set segment iteration policy for CUDA
378 //
379 
383 template<size_t BLOCK_SIZE,
384  size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
385  bool Async = false>
386 struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t<
387  RAJA::Policy::cuda,
388  RAJA::Pattern::workgroup_exec,
389  detail::get_launch<Async>::value,
390  RAJA::Platform::cuda>
391 {};
392 
// Workgroup ordering policy tagged with the CUDA policy, the workgroup_order
// pattern and the CUDA platform.
// NOTE(review): the listing jumps from source line 397 to 399, so the
// base-class introducer (the text that opens the template argument list
// below) is missing and this declaration does not parse as shown. Presumably
// it names a RAJA policy/pattern/platform helper -- restore it from the
// upstream header.
397 struct unordered_cuda_loop_y_block_iter_x_threadblock_average
399  RAJA::Policy::cuda,
400  RAJA::Pattern::workgroup_order,
401  RAJA::Platform::cuda>
402 {};
403 
411 
412 template<typename tuning>
413 struct cuda_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
414  RAJA::Policy::cuda,
415  RAJA::Pattern::reduce,
416  detail::get_launch<false>::value,
417  RAJA::Platform::cuda,
418  std::conditional_t<tuning::consistent,
419  reduce::ordered,
420  reduce::unordered>>
421 {};
422 
423 template<typename tuning>
424 struct cuda_multi_reduce_policy
426  RAJA::Policy::cuda,
427  RAJA::Pattern::multi_reduce,
428  detail::get_launch<false>::value,
429  RAJA::Platform::cuda,
430  std::conditional_t<tuning::consistent,
431  reduce::ordered,
432  reduce::unordered>>
433 {};
434 
439 template<typename host_policy>
440 struct cuda_atomic_explicit
441 {};
442 
447 using cuda_atomic = cuda_atomic_explicit<seq_atomic>;
448 
449 // Policy for RAJA::statement::Reduce that reduces threads in a block
450 // down to threadIdx 0
451 struct cuda_block_reduce
452 {};
453 
454 // Policy for RAJA::statement::Reduce that reduces threads in a warp
455 // down to the first lane of the warp
456 struct cuda_warp_reduce
457 {};
458 
459 // Policy to map work directly to threads within a warp
460 // Maximum iteration count is WARP_SIZE
461 // Cannot be used in conjunction with cuda_thread_x_*
462 // Multiple warps have to be created by using cuda_thread_{yz}_*
463 struct cuda_warp_direct
464 {};
465 
466 // Policy to map work to threads within a warp using a warp-stride loop
467 // Cannot be used in conjunction with cuda_thread_x_*
468 // Multiple warps have to be created by using cuda_thread_{yz}_*
469 struct cuda_warp_loop
470 {};
471 
472 // Policy to map work to threads within a warp using a bit mask
473 // Cannot be used in conjunction with cuda_thread_x_*
474 // Multiple warps have to be created by using cuda_thread_{yz}_*
475 // Since we are masking specific threads, multiple nested
476 // cuda_warp_masked
477 // can be used to create complex thread interleaving patterns
478 template<typename Mask>
479 struct cuda_warp_masked_direct
480 {};
481 
482 // Policy to map work to threads within a warp using a bit mask
483 // Cannot be used in conjunction with cuda_thread_x_*
484 // Multiple warps have to be created by using cuda_thread_{yz}_*
485 // Since we are masking specific threads, multiple nested
486 // cuda_warp_masked
487 // can be used to create complex thread interleaving patterns
488 template<typename Mask>
489 struct cuda_warp_masked_loop
490 {};
491 
492 template<typename Mask>
493 struct cuda_thread_masked_direct
494 {};
495 
496 template<typename Mask>
497 struct cuda_thread_masked_loop
498 {};
499 
500 struct cuda_synchronize : make_policy_pattern_launch_t<Policy::cuda,
501  Pattern::synchronize,
502  Launch::sync>
503 {};
504 
505 } // end namespace cuda
506 } // end namespace policy
507 
508 namespace internal
509 {
510 
511 RAJA_INLINE
512 int get_size(cuda_dim_t dims)
513 {
514  if (dims.x == 0 && dims.y == 0 && dims.z == 0)
515  {
516  return 0;
517  }
518  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
519 }
520 
521 struct CudaDims
522 {
523 
524  cuda_dim_t blocks {0, 0, 0};
525  cuda_dim_t threads {0, 0, 0};
526 
527  CudaDims() = default;
528  CudaDims(CudaDims const&) = default;
529  CudaDims& operator=(CudaDims const&) = default;
530 
531  RAJA_INLINE
532  CudaDims(cuda_dim_member_t default_val)
533  : blocks {default_val, default_val, default_val},
534  threads {default_val, default_val, default_val}
535  {}
536 
537  RAJA_INLINE
538  int num_blocks() const { return get_size(blocks); }
539 
540  RAJA_INLINE
541  int num_threads() const { return get_size(threads); }
542 
543  RAJA_INLINE
544  cuda_dim_t get_blocks() const
545  {
546  if (num_blocks() != 0)
547  {
548  return {(blocks.x ? blocks.x : 1), (blocks.y ? blocks.y : 1),
549  (blocks.z ? blocks.z : 1)};
550  }
551  else
552  {
553  return blocks;
554  }
555  }
556 
557  RAJA_INLINE
558  cuda_dim_t get_threads() const
559  {
560  if (num_threads() != 0)
561  {
562  return {(threads.x ? threads.x : 1), (threads.y ? threads.y : 1),
563  (threads.z ? threads.z : 1)};
564  }
565  else
566  {
567  return threads;
568  }
569  }
570 };
571 
572 template<named_dim dim>
573 struct CudaDimHelper;
574 
575 template<>
576 struct CudaDimHelper<named_dim::x>
577 {
578 
579  template<typename dim_t>
580  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
581  {
582  return d.x;
583  }
584 
585  template<typename dim_t>
586  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
587  {
588  d.x = value;
589  }
590 };
591 
592 template<>
593 struct CudaDimHelper<named_dim::y>
594 {
595 
596  template<typename dim_t>
597  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
598  {
599  return d.y;
600  }
601 
602  template<typename dim_t>
603  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
604  {
605  d.y = value;
606  }
607 };
608 
609 template<>
610 struct CudaDimHelper<named_dim::z>
611 {
612 
613  template<typename dim_t>
614  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
615  {
616  return d.z;
617  }
618 
619  template<typename dim_t>
620  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
621  {
622  d.z = value;
623  }
624 };
625 
626 template<named_dim dim, typename dim_t>
627 RAJA_HOST_DEVICE constexpr cuda_dim_member_t get_cuda_dim(dim_t const& d)
628 {
629  return CudaDimHelper<dim>::get(d);
630 }
631 
632 template<named_dim dim, typename dim_t>
633 RAJA_HOST_DEVICE void set_cuda_dim(dim_t& d, cuda_dim_member_t value)
634 {
635  return CudaDimHelper<dim>::set(d, value);
636 }
637 
638 } // namespace internal
639 
640 namespace cuda
641 {
642 
644 struct IndexSize
645 {
646  cuda_dim_member_t block_size = named_usage::unspecified;
647  cuda_dim_member_t grid_size = named_usage::unspecified;
648 
649  RAJA_HOST_DEVICE constexpr IndexSize(
650  cuda_dim_member_t _block_size = named_usage::unspecified,
651  cuda_dim_member_t _grid_size = named_usage::unspecified)
652  : block_size(_block_size),
653  grid_size(_grid_size)
654  {}
655 };
656 
657 // Class to help cache thread indices or not based on template arg
658 template<bool cache_threadIdx>
659 struct ThreadIndices
660 {
661  template<named_dim dim>
662  RAJA_DEVICE constexpr cuda_dim_member_t get_threadIdx() const
663  {
665  }
666 };
667 
668 template<>
669 struct ThreadIndices<true>
670 {
671  dim3 m_threadIdx;
672 
673  RAJA_HOST_DEVICE ThreadIndices()
674 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
675  : m_threadIdx(threadIdx)
676 #endif
677  {}
678 
679  template<named_dim dim>
680  RAJA_DEVICE constexpr cuda_dim_member_t get_threadIdx() const
681  {
683  }
684 };
685 
686 // Class to help cache block indices or not based on template arg
687 template<bool cache_blockIdx>
688 struct BlockIndices
689 {
690  template<named_dim dim>
691  RAJA_DEVICE constexpr cuda_dim_member_t get_blockIdx() const
692  {
694  }
695 };
696 
697 template<>
698 struct BlockIndices<true>
699 {
700  dim3 m_blockIdx;
701 
702  RAJA_HOST_DEVICE BlockIndices()
703 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
704  : m_blockIdx(blockIdx)
705 #endif
706  {}
707 
708  template<named_dim dim>
709  RAJA_DEVICE constexpr cuda_dim_member_t get_blockIdx() const
710  {
712  }
713 };
714 
715 // Class to help cache block dimensions or not based on template arg
716 template<bool cache_blockDim>
717 struct BlockDimensions
718 {
719  template<named_dim dim>
720  RAJA_DEVICE constexpr cuda_dim_member_t get_blockDim() const
721  {
723  }
724 };
725 
726 template<>
727 struct BlockDimensions<true>
728 {
729  dim3 m_blockDim;
730 
731  RAJA_HOST_DEVICE BlockDimensions()
732 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
733  : m_blockDim(blockDim)
734 #endif
735  {}
736 
737  template<named_dim dim>
738  RAJA_DEVICE constexpr cuda_dim_member_t get_blockDim() const
739  {
741  }
742 };
743 
744 // Class to help cache grid dimensions or not based on template arg
745 template<bool cache_gridDim>
746 struct GridDimensions
747 {
748  template<named_dim dim>
749  RAJA_DEVICE constexpr cuda_dim_member_t get_gridDim() const
750  {
752  }
753 };
754 
755 template<>
756 struct GridDimensions<true>
757 {
758  dim3 m_gridDim = gridDim;
759 
760  RAJA_HOST_DEVICE GridDimensions()
761 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
762  : m_gridDim(gridDim)
763 #endif
764  {}
765 
766  template<named_dim dim>
767  RAJA_DEVICE constexpr cuda_dim_member_t get_gridDim() const
768  {
770  }
771 };
772 
773 // Class to help cache indices and dimensions or not based on template args
774 template<bool cache_threadIdx,
775  bool cache_blockIdx,
776  bool cache_blockDim,
777  bool cache_gridDim>
778 struct IndicesAndDims : ThreadIndices<cache_threadIdx>,
779  BlockIndices<cache_blockIdx>,
780  BlockDimensions<cache_blockDim>,
781  GridDimensions<cache_gridDim>
782 {};
783 
784 // Nothing cached
785 using NonCachedIndicesAndDims = IndicesAndDims<false, false, false, false>;
786 
787 // blockDim cached, rest not cached
788 using CachedBlockDims = IndicesAndDims<false, false, true, false>;
789 
790 // threadIdx, blockIdx, blockDim, gridDim cached
791 using AllCachedIndicesAndDims = IndicesAndDims<true, true, true, true>;
792 
800 template<typename IndicesAndDimsT = NonCachedIndicesAndDims>
801 struct LaunchContextIndicesAndDimsPolicy
802 {
803  using indices_and_dims_t = IndicesAndDimsT;
804 };
805 
806 using LaunchContextNonCachedIndicesAndDimsPolicy =
807  LaunchContextIndicesAndDimsPolicy<NonCachedIndicesAndDims>;
808 
809 using LaunchContextCachedBlockDimsPolicy =
810  LaunchContextIndicesAndDimsPolicy<CachedBlockDims>;
811 
812 using LaunchContextAllCachedIndicesAndDimsPolicy =
813  LaunchContextIndicesAndDimsPolicy<AllCachedIndicesAndDims>;
814 
817 
820 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
821 struct IndexGlobal
822 {
823  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
824  static_assert(GRID_SIZE > 0, "grid size must not be negative");
825 
826  static constexpr int block_size = BLOCK_SIZE;
827  static constexpr int grid_size = GRID_SIZE;
828 
829  template<typename IdxT = cuda_dim_member_t,
830  typename IdxNDims = NonCachedIndicesAndDims>
831  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
832  {
833  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
834  static_cast<IdxT>(block_size) *
835  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
836  }
837 
838  template<typename IdxT = cuda_dim_member_t,
839  typename IdxNDims = NonCachedIndicesAndDims>
840  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
841  {
842  return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
843  }
844 };
845 
847 template<named_dim dim, int GRID_SIZE>
848 struct IndexGlobal<dim, 1, GRID_SIZE>
849 {
850  static_assert(GRID_SIZE > 0, "grid size must not be negative");
851 
852  static constexpr int block_size = 1;
853  static constexpr int grid_size = GRID_SIZE;
854 
855  template<typename IdxT = cuda_dim_member_t,
856  typename IdxNDims = NonCachedIndicesAndDims>
857  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
858  {
859  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
860  }
861 
862  template<typename IdxT = cuda_dim_member_t,
863  typename IdxNDims = NonCachedIndicesAndDims>
864  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
865  {
866  return static_cast<IdxT>(grid_size);
867  }
868 };
869 
871 template<named_dim dim, int BLOCK_SIZE>
872 struct IndexGlobal<dim, BLOCK_SIZE, 1>
873 {
874  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
875 
876  static constexpr int block_size = BLOCK_SIZE;
877  static constexpr int grid_size = 1;
878 
879  template<typename IdxT = cuda_dim_member_t,
880  typename IdxNDims = NonCachedIndicesAndDims>
881  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
882  {
883  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
884  }
885 
886  template<typename IdxT = cuda_dim_member_t,
887  typename IdxNDims = NonCachedIndicesAndDims>
888  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
889  {
890  return static_cast<IdxT>(block_size);
891  }
892 };
893 
895 template<named_dim dim>
896 struct IndexGlobal<dim, 1, 1>
897 {
898  static constexpr int block_size = 1;
899  static constexpr int grid_size = 1;
900 
901  template<typename IdxT = cuda_dim_member_t,
902  typename IdxNDims = NonCachedIndicesAndDims>
903  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
904  {
905  return static_cast<IdxT>(0);
906  }
907 
908  template<typename IdxT = cuda_dim_member_t,
909  typename IdxNDims = NonCachedIndicesAndDims>
910  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
911  {
912  return static_cast<IdxT>(1);
913  }
914 };
915 
917 template<named_dim dim, int GRID_SIZE>
918 struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
919 {
920  static_assert(GRID_SIZE > 0, "grid size must not be negative");
921 
922  static constexpr int block_size = named_usage::unspecified;
923  static constexpr int grid_size = GRID_SIZE;
924 
925  template<typename IdxT = cuda_dim_member_t,
926  typename IdxNDims = NonCachedIndicesAndDims>
927  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
928  {
929  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
930  static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
931  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
932  }
933 
934  template<typename IdxT = cuda_dim_member_t,
935  typename IdxNDims = NonCachedIndicesAndDims>
936  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
937  {
938  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
939  static_cast<IdxT>(grid_size);
940  }
941 };
942 
944 template<named_dim dim>
945 struct IndexGlobal<dim, named_usage::unspecified, 1>
946 {
947  static constexpr int block_size = named_usage::unspecified;
948  static constexpr int grid_size = 1;
949 
950  template<typename IdxT = cuda_dim_member_t,
951  typename IdxNDims = NonCachedIndicesAndDims>
952  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
953  {
954  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
955  }
956 
957  template<typename IdxT = cuda_dim_member_t,
958  typename IdxNDims = NonCachedIndicesAndDims>
959  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
960  {
961  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>());
962  }
963 };
964 
966 template<named_dim dim, int BLOCK_SIZE>
967 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
968 {
969  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
970 
971  static constexpr int block_size = BLOCK_SIZE;
972  static constexpr int grid_size = named_usage::unspecified;
973 
974  template<typename IdxT = cuda_dim_member_t,
975  typename IdxNDims = NonCachedIndicesAndDims>
976  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
977  {
978  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
979  static_cast<IdxT>(block_size) *
980  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
981  }
982 
983  template<typename IdxT = cuda_dim_member_t,
984  typename IdxNDims = NonCachedIndicesAndDims>
985  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
986  {
987  return static_cast<IdxT>(block_size) *
988  static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
989  }
990 };
991 
993 template<named_dim dim>
994 struct IndexGlobal<dim, 1, named_usage::unspecified>
995 {
996  static constexpr int block_size = 1;
997  static constexpr int grid_size = named_usage::unspecified;
998 
999  template<typename IdxT = cuda_dim_member_t,
1000  typename IdxNDims = NonCachedIndicesAndDims>
1001  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1002  {
1003  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
1004  }
1005 
1006  template<typename IdxT = cuda_dim_member_t,
1007  typename IdxNDims = NonCachedIndicesAndDims>
1008  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1009  {
1010  return static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
1011  }
1012 };
1013 
1015 template<named_dim dim>
1016 struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
1017 {
1018  static constexpr int block_size = named_usage::unspecified;
1019  static constexpr int grid_size = named_usage::unspecified;
1020 
1021  template<typename IdxT = cuda_dim_member_t,
1022  typename IdxNDims = NonCachedIndicesAndDims>
1023  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1024  {
1025  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
1026  static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
1027  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
1028  }
1029 
1030  template<typename IdxT = cuda_dim_member_t,
1031  typename IdxNDims = NonCachedIndicesAndDims>
1032  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1033  {
1034  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
1035  static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
1036  }
1037 };
1038 
1041 template<named_dim dim, int GRID_SIZE>
1042 struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
1043 {
1044  static_assert(GRID_SIZE > 0, "grid size must not be negative");
1045 
1046  static constexpr int block_size = named_usage::ignored;
1047  static constexpr int grid_size = GRID_SIZE;
1048 
1049  template<typename IdxT = cuda_dim_member_t,
1050  typename IdxNDims = NonCachedIndicesAndDims>
1051  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1052  {
1053  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
1054  }
1055 
1056  template<typename IdxT = cuda_dim_member_t,
1057  typename IdxNDims = NonCachedIndicesAndDims>
1058  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1059  {
1060  return static_cast<IdxT>(grid_size);
1061  }
1062 };
1063 
1065 template<named_dim dim>
1066 struct IndexGlobal<dim, named_usage::ignored, 1>
1067 {
1068  static constexpr int block_size = named_usage::ignored;
1069  static constexpr int grid_size = 1;
1070 
1071  template<typename IdxT = cuda_dim_member_t,
1072  typename IdxNDims = NonCachedIndicesAndDims>
1073  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1074  {
1075  return static_cast<IdxT>(0);
1076  }
1077 
1078  template<typename IdxT = cuda_dim_member_t,
1079  typename IdxNDims = NonCachedIndicesAndDims>
1080  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1081  {
1082  return static_cast<IdxT>(1);
1083  }
1084 };
1085 
1087 template<named_dim dim>
1088 struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
1089 {
1090  static constexpr int block_size = named_usage::ignored;
1091  static constexpr int grid_size = named_usage::unspecified;
1092 
1093  template<typename IdxT = cuda_dim_member_t,
1094  typename IdxNDims = NonCachedIndicesAndDims>
1095  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1096  {
1097  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
1098  }
1099 
1100  template<typename IdxT = cuda_dim_member_t,
1101  typename IdxNDims = NonCachedIndicesAndDims>
1102  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1103  {
1104  return static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
1105  }
1106 };
1107 
1110 template<named_dim dim, int BLOCK_SIZE>
1111 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
1112 {
1113  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
1114 
1115  static constexpr int block_size = BLOCK_SIZE;
1116  static constexpr int grid_size = named_usage::ignored;
1117 
1118  template<typename IdxT = cuda_dim_member_t,
1119  typename IdxNDims = NonCachedIndicesAndDims>
1120  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1121  {
1122  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
1123  }
1124 
1125  template<typename IdxT = cuda_dim_member_t,
1126  typename IdxNDims = NonCachedIndicesAndDims>
1127  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1128  {
1129  return static_cast<IdxT>(block_size);
1130  }
1131 };
1132 
1134 template<named_dim dim>
1135 struct IndexGlobal<dim, 1, named_usage::ignored>
1136 {
1137  static constexpr int block_size = 1;
1138  static constexpr int grid_size = named_usage::ignored;
1139 
1140  template<typename IdxT = cuda_dim_member_t,
1141  typename IdxNDims = NonCachedIndicesAndDims>
1142  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1143  {
1144  return static_cast<IdxT>(0);
1145  }
1146 
1147  template<typename IdxT = cuda_dim_member_t,
1148  typename IdxNDims = NonCachedIndicesAndDims>
1149  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1150  {
1151  return static_cast<IdxT>(1);
1152  }
1153 };
1154 
1156 template<named_dim dim>
1157 struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
1158 {
1159  static constexpr int block_size = named_usage::unspecified;
1160  static constexpr int grid_size = named_usage::ignored;
1161 
1162  template<typename IdxT = cuda_dim_member_t,
1163  typename IdxNDims = NonCachedIndicesAndDims>
1164  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1165  {
1166  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
1167  }
1168 
1169  template<typename IdxT = cuda_dim_member_t,
1170  typename IdxNDims = NonCachedIndicesAndDims>
1171  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1172  {
1173  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>());
1174  }
1175 };
1176 
1179 template<named_dim dim>
1180 struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
1181 {
1182  static constexpr int block_size = named_usage::ignored;
1183  static constexpr int grid_size = named_usage::ignored;
1184 
1185  template<typename IdxT = cuda_dim_member_t,
1186  typename IdxNDims = NonCachedIndicesAndDims>
1187  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1188  {
1189  return static_cast<IdxT>(0);
1190  }
1191 
1192  template<typename IdxT = cuda_dim_member_t,
1193  typename IdxNDims = NonCachedIndicesAndDims>
1194  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1195  {
1196  return static_cast<IdxT>(1);
1197  }
1198 };
1199 
1200 // useful for flatten global index (includes x)
1201 template<typename x_index>
1202 struct IndexFlatten<x_index>
1203 {
1204 
1205  template<typename IdxT = cuda_dim_member_t,
1206  typename IdxNDims = NonCachedIndicesAndDims>
1207  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1208  {
1209 
1210  return x_index::template index<IdxT>(idxNDims);
1211  }
1212 
1213  template<typename IdxT = cuda_dim_member_t,
1214  typename IdxNDims = NonCachedIndicesAndDims>
1215  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1216  {
1217  return x_index::template size<IdxT>(idxNDims);
1218  }
1219 };
1220 
1221 // useful for flatten global index (includes x,y)
1222 template<typename x_index, typename y_index>
1223 struct IndexFlatten<x_index, y_index>
1224 {
1225 
1226  template<typename IdxT = cuda_dim_member_t,
1227  typename IdxNDims = NonCachedIndicesAndDims>
1228  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1229  {
1230 
1231  return x_index::template index<IdxT>(idxNDims) +
1232  x_index::template size<IdxT>(idxNDims) *
1233  (y_index::template index<IdxT>(idxNDims));
1234  }
1235 
1236  template<typename IdxT = cuda_dim_member_t,
1237  typename IdxNDims = NonCachedIndicesAndDims>
1238  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1239  {
1240  return x_index::template size<IdxT>(idxNDims) *
1241  y_index::template size<IdxT>(idxNDims);
1242  }
1243 };
1244 
1245 // useful for flatten global index (includes x,y,z)
1246 template<typename x_index, typename y_index, typename z_index>
1247 struct IndexFlatten<x_index, y_index, z_index>
1248 {
1249 
1250  template<typename IdxT = cuda_dim_member_t,
1251  typename IdxNDims = NonCachedIndicesAndDims>
1252  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1253  {
1254 
1255  return x_index::template index<IdxT>(idxNDims) +
1256  x_index::template size<IdxT>(idxNDims) *
1257  (y_index::template index<IdxT>(idxNDims) +
1258  y_index::template size<IdxT>(idxNDims) *
1259  z_index::template index<IdxT>(idxNDims));
1260  }
1261 
1262  template<typename IdxT = cuda_dim_member_t,
1263  typename IdxNDims = NonCachedIndicesAndDims>
1264  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1265  {
1266  return x_index::template size<IdxT>(idxNDims) *
1267  y_index::template size<IdxT>(idxNDims) *
1268  z_index::template size<IdxT>(idxNDims);
1269  }
1270 };
1271 
1272 template<size_t divisor, typename indexer>
1273 struct IndexDivide
1274 {
1275  template<typename IdxT = cuda_dim_member_t,
1276  typename IdxNDims = NonCachedIndicesAndDims>
1277  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1278  {
1279  return indexer::template index<IdxT>(idxNDims) / static_cast<IdxT>(divisor);
1280  }
1281 
1282  template<typename IdxT = cuda_dim_member_t,
1283  typename IdxNDims = NonCachedIndicesAndDims>
1284  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1285  {
1286  return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(idxNDims),
1287  static_cast<IdxT>(divisor));
1288  }
1289 };
1290 
1291 template<size_t divisor, typename indexer>
1292 struct IndexModulo
1293 {
1294  template<typename IdxT = cuda_dim_member_t,
1295  typename IdxNDims = NonCachedIndicesAndDims>
1296  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1297  {
1298  return indexer::template index<IdxT>(idxNDims) % static_cast<IdxT>(divisor);
1299  }
1300 
1301  template<typename IdxT = cuda_dim_member_t,
1302  typename IdxNDims = NonCachedIndicesAndDims>
1303  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1304  {
1305  return static_cast<IdxT>(divisor);
1306  }
1307 };
1308 
1309 
1310 // helper to get just the thread indexing part of IndexGlobal
1311 template<typename index_global>
1312 struct get_index_thread;
1313 
1315 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
1316 struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
1317 {
1318  using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
1319 };
1320 
1322 template<typename x_index, typename y_index, typename z_index>
1323 struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
1324 {
1325  using type = IndexFlatten<typename get_index_thread<x_index>::type,
1326  typename get_index_thread<y_index>::type,
1327  typename get_index_thread<z_index>::type>;
1328 };
1329 
1330 // helper to get just the block indexing part of IndexGlobal
1331 template<typename index_global>
1332 struct get_index_block;
1333 
1335 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
1336 struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
1337 {
1338  using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
1339 };
1340 
1342 template<typename x_index, typename y_index, typename z_index>
1343 struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
1344 {
1345  using type = IndexFlatten<typename get_index_block<x_index>::type,
1346  typename get_index_block<y_index>::type,
1347  typename get_index_block<z_index>::type>;
1348 };
1349 
1350 template<size_t BLOCK_SIZE = named_usage::unspecified>
1351 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
1352 template<size_t BLOCK_SIZE = named_usage::unspecified>
1353 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
1354 template<size_t BLOCK_SIZE = named_usage::unspecified>
1355 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
1356 
1357 template<size_t BLOCK_SIZE_X = named_usage::unspecified,
1358  size_t BLOCK_SIZE_Y = named_usage::unspecified,
1359  size_t BLOCK_SIZE_Z = named_usage::unspecified>
1360 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
1361  thread_y<BLOCK_SIZE_Y>,
1362  thread_z<BLOCK_SIZE_Z>>;
1363 
1364 template<size_t GRID_SIZE = named_usage::unspecified>
1365 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
1366 template<size_t GRID_SIZE = named_usage::unspecified>
1367 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
1368 template<size_t GRID_SIZE = named_usage::unspecified>
1369 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
1370 
1371 template<size_t GRID_SIZE_X = named_usage::unspecified,
1372  size_t GRID_SIZE_Y = named_usage::unspecified,
1373  size_t GRID_SIZE_Z = named_usage::unspecified>
1374 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
1375  block_y<GRID_SIZE_Y>,
1376  block_z<GRID_SIZE_Z>>;
1377 
1378 template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
1379 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
1380 template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
1381 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
1382 template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
1383 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
1384 
1385 
1386 template<size_t BLOCK_SIZE_X,
1387  size_t BLOCK_SIZE_Y,
1388  size_t BLOCK_SIZE_Z,
1389  size_t GRID_SIZE_X = named_usage::unspecified,
1390  size_t GRID_SIZE_Y = named_usage::unspecified,
1391  size_t GRID_SIZE_Z = named_usage::unspecified>
1392 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
1393  global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
1394  global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
1395 
1396 
1397 template<size_t WARP_SIZE = RAJA::policy::cuda::device_constants.WARP_SIZE,
1398  size_t BLOCK_SIZE_X = named_usage::unspecified,
1399  size_t BLOCK_SIZE_Y = named_usage::unspecified,
1400  size_t BLOCK_SIZE_Z = named_usage::unspecified>
1401 using warp_xyz =
1402  IndexDivide<WARP_SIZE,
1403  thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
1404 
1405 template<size_t WARP_SIZE = RAJA::policy::cuda::device_constants.WARP_SIZE,
1406  size_t BLOCK_SIZE_X = named_usage::unspecified,
1407  size_t BLOCK_SIZE_Y = named_usage::unspecified,
1408  size_t BLOCK_SIZE_Z = named_usage::unspecified,
1409  size_t GRID_SIZE_X = named_usage::unspecified,
1410  size_t GRID_SIZE_Y = named_usage::unspecified,
1411  size_t GRID_SIZE_Z = named_usage::unspecified>
1412 using warp_global_xyz =
1413  IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
1414  block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
1415 
1416 } // namespace cuda
1417 
1418 using CudaAllCachedIndicesAndDims = cuda::AllCachedIndicesAndDims;
1419 using CudaCachedBlockDims = cuda::CachedBlockDims;
1420 using CudaNonCachedIndicesAndDims = cuda::NonCachedIndicesAndDims;
1421 
1422 template<bool cache_threadIdx,
1423  bool cache_blockIdx,
1424  bool cache_blockDim,
1425  bool cache_gridDim>
1426 using CudaIndicesAndDims = cuda::IndicesAndDims<cache_threadIdx,
1427  cache_blockIdx,
1428  cache_blockDim,
1429  cache_gridDim>;
1430 
1431 using CudaLaunchContextAllCachedIndicesAndDimsPolicy =
1432  cuda::LaunchContextAllCachedIndicesAndDimsPolicy;
1433 using CudaLaunchContextCachedBlockDimsPolicy =
1434  cuda::LaunchContextCachedBlockDimsPolicy;
1435 template<typename IndicesAndDimsT = cuda::NonCachedIndicesAndDims>
1436 using CudaLaunchContextIndicesAndDimsPolicy =
1437  cuda::LaunchContextIndicesAndDimsPolicy<IndicesAndDimsT>;
1438 using CudaLaunchContextNonCachedIndicesAndDimsPolicy =
1439  cuda::LaunchContextNonCachedIndicesAndDimsPolicy;
1440 
1441 // concretizers used in forall, scan, and sort policies
1442 
1443 using CudaAvoidDeviceMaxThreadOccupancyConcretizer =
1444  cuda::AvoidDeviceMaxThreadOccupancyConcretizer<
1445  cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
1446 
1447 template<typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
1448 using CudaFractionOffsetOccupancyConcretizer =
1449  cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
1450 
1451 using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer;
1452 
1453 using CudaReduceDefaultConcretizer = CudaMaxOccupancyConcretizer;
1454 
1455 using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer;
1456 
1457 // policies usable with forall, scan, and sort
1458 
1459 template<size_t BLOCK_SIZE,
1460  size_t GRID_SIZE,
1461  size_t BLOCKS_PER_SM,
1462  bool Async = false>
1463 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
1464  iteration_mapping::StridedLoop<named_usage::unspecified>,
1465  cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1466  CudaDefaultConcretizer,
1467  BLOCKS_PER_SM,
1468  Async>;
1469 
1470 template<size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM>
1471 using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit<
1472  iteration_mapping::StridedLoop<named_usage::unspecified>,
1473  cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1474  CudaDefaultConcretizer,
1475  BLOCKS_PER_SM,
1476  true>;
1477 
1478 template<size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
1479 using cuda_exec_grid = policy::cuda::cuda_exec_explicit<
1480  iteration_mapping::StridedLoop<named_usage::unspecified>,
1481  cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1482  CudaDefaultConcretizer,
1483  policy::cuda::MIN_BLOCKS_PER_SM,
1484  Async>;
1485 
1486 template<size_t BLOCK_SIZE, size_t GRID_SIZE>
1487 using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit<
1488  iteration_mapping::StridedLoop<named_usage::unspecified>,
1489  cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1490  CudaDefaultConcretizer,
1491  policy::cuda::MIN_BLOCKS_PER_SM,
1492  true>;
1493 
1494 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
1495 using cuda_exec_explicit =
1496  policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1497  cuda::global_x<BLOCK_SIZE>,
1498  CudaDefaultConcretizer,
1499  BLOCKS_PER_SM,
1500  Async>;
1501 
1502 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
1503 using cuda_exec_explicit_async =
1504  policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1505  cuda::global_x<BLOCK_SIZE>,
1506  CudaDefaultConcretizer,
1507  BLOCKS_PER_SM,
1508  true>;
1509 
1510 template<size_t BLOCK_SIZE, bool Async = false>
1511 using cuda_exec =
1512  policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1513  cuda::global_x<BLOCK_SIZE>,
1514  CudaDefaultConcretizer,
1515  policy::cuda::MIN_BLOCKS_PER_SM,
1516  Async>;
1517 
1518 template<size_t BLOCK_SIZE>
1519 using cuda_exec_async =
1520  policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1521  cuda::global_x<BLOCK_SIZE>,
1522  CudaDefaultConcretizer,
1523  policy::cuda::MIN_BLOCKS_PER_SM,
1524  true>;
1525 
1526 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
1527 using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit<
1528  iteration_mapping::StridedLoop<named_usage::unspecified>,
1529  cuda::global_x<BLOCK_SIZE>,
1530  CudaDefaultConcretizer,
1531  BLOCKS_PER_SM,
1532  Async>;
1533 
1534 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
1535 using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit<
1536  iteration_mapping::StridedLoop<named_usage::unspecified>,
1537  cuda::global_x<BLOCK_SIZE>,
1538  CudaDefaultConcretizer,
1539  BLOCKS_PER_SM,
1540  true>;
1541 
1542 template<size_t BLOCK_SIZE, bool Async = false>
1543 using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit<
1544  iteration_mapping::StridedLoop<named_usage::unspecified>,
1545  cuda::global_x<BLOCK_SIZE>,
1546  CudaDefaultConcretizer,
1547  policy::cuda::MIN_BLOCKS_PER_SM,
1548  Async>;
1549 
1550 template<size_t BLOCK_SIZE>
1551 using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit<
1552  iteration_mapping::StridedLoop<named_usage::unspecified>,
1553  cuda::global_x<BLOCK_SIZE>,
1554  CudaDefaultConcretizer,
1555  policy::cuda::MIN_BLOCKS_PER_SM,
1556  true>;
1557 
1558 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
1559 using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit<
1560  iteration_mapping::StridedLoop<named_usage::unspecified>,
1561  cuda::global_x<BLOCK_SIZE>,
1562  CudaMaxOccupancyConcretizer,
1563  BLOCKS_PER_SM,
1564  Async>;
1565 
1566 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
1567 using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit<
1568  iteration_mapping::StridedLoop<named_usage::unspecified>,
1569  cuda::global_x<BLOCK_SIZE>,
1570  CudaMaxOccupancyConcretizer,
1571  BLOCKS_PER_SM,
1572  true>;
1573 
1574 template<size_t BLOCK_SIZE, bool Async = false>
1575 using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit<
1576  iteration_mapping::StridedLoop<named_usage::unspecified>,
1577  cuda::global_x<BLOCK_SIZE>,
1578  CudaMaxOccupancyConcretizer,
1579  policy::cuda::MIN_BLOCKS_PER_SM,
1580  Async>;
1581 
1582 template<size_t BLOCK_SIZE>
1583 using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit<
1584  iteration_mapping::StridedLoop<named_usage::unspecified>,
1585  cuda::global_x<BLOCK_SIZE>,
1586  CudaMaxOccupancyConcretizer,
1587  policy::cuda::MIN_BLOCKS_PER_SM,
1588  true>;
1589 
1590 template<size_t BLOCK_SIZE,
1591  size_t BLOCKS_PER_SM,
1592  typename Fraction,
1593  bool Async = false>
1594 using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit<
1595  iteration_mapping::StridedLoop<named_usage::unspecified>,
1596  cuda::global_x<BLOCK_SIZE>,
1597  CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
1598  BLOCKS_PER_SM,
1599  Async>;
1600 
1601 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction>
1602 using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit<
1603  iteration_mapping::StridedLoop<named_usage::unspecified>,
1604  cuda::global_x<BLOCK_SIZE>,
1605  CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
1606  BLOCKS_PER_SM,
1607  true>;
1608 
1609 template<size_t BLOCK_SIZE, typename Fraction, bool Async = false>
1610 using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit<
1611  iteration_mapping::StridedLoop<named_usage::unspecified>,
1612  cuda::global_x<BLOCK_SIZE>,
1613  CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
1614  policy::cuda::MIN_BLOCKS_PER_SM,
1615  Async>;
1616 
1617 template<size_t BLOCK_SIZE, typename Fraction>
1618 using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit<
1619  iteration_mapping::StridedLoop<named_usage::unspecified>,
1620  cuda::global_x<BLOCK_SIZE>,
1621  CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
1622  policy::cuda::MIN_BLOCKS_PER_SM,
1623  true>;
1624 
1625 template<size_t BLOCK_SIZE,
1626  size_t BLOCKS_PER_SM,
1627  typename Concretizer,
1628  bool Async = false>
1629 using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit<
1630  iteration_mapping::StridedLoop<named_usage::unspecified>,
1631  cuda::global_x<BLOCK_SIZE>,
1632  Concretizer,
1633  BLOCKS_PER_SM,
1634  Async>;
1635 
1636 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer>
1637 using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit<
1638  iteration_mapping::StridedLoop<named_usage::unspecified>,
1639  cuda::global_x<BLOCK_SIZE>,
1640  Concretizer,
1641  BLOCKS_PER_SM,
1642  true>;
1643 
1644 template<size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
1645 using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit<
1646  iteration_mapping::StridedLoop<named_usage::unspecified>,
1647  cuda::global_x<BLOCK_SIZE>,
1648  Concretizer,
1649  policy::cuda::MIN_BLOCKS_PER_SM,
1650  Async>;
1651 
1652 template<size_t BLOCK_SIZE, typename Concretizer>
1653 using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit<
1654  iteration_mapping::StridedLoop<named_usage::unspecified>,
1655  cuda::global_x<BLOCK_SIZE>,
1656  Concretizer,
1657  policy::cuda::MIN_BLOCKS_PER_SM,
1658  true>;
1659 
1660 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
1661 using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit<
1662  iteration_mapping::StridedLoop<named_usage::unspecified>,
1663  cuda::global_x<BLOCK_SIZE>,
1664  CudaReduceDefaultConcretizer,
1665  BLOCKS_PER_SM,
1666  Async>;
1667 
1668 template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
1669 using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit<
1670  iteration_mapping::StridedLoop<named_usage::unspecified>,
1671  cuda::global_x<BLOCK_SIZE>,
1672  CudaReduceDefaultConcretizer,
1673  BLOCKS_PER_SM,
1674  true>;
1675 
1676 template<size_t BLOCK_SIZE, bool Async = false>
1677 using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit<
1678  iteration_mapping::StridedLoop<named_usage::unspecified>,
1679  cuda::global_x<BLOCK_SIZE>,
1680  CudaReduceDefaultConcretizer,
1681  policy::cuda::MIN_BLOCKS_PER_SM,
1682  Async>;
1683 
1684 template<size_t BLOCK_SIZE>
1685 using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit<
1686  iteration_mapping::StridedLoop<named_usage::unspecified>,
1687  cuda::global_x<BLOCK_SIZE>,
1688  CudaReduceDefaultConcretizer,
1689  policy::cuda::MIN_BLOCKS_PER_SM,
1690  true>;
1691 
1692 template<bool with_reduce,
1693  size_t BLOCK_SIZE,
1694  size_t BLOCKS_PER_SM,
1695  bool Async = false>
1696 using cuda_exec_base_explicit = std::conditional_t<
1697  with_reduce,
1698  cuda_exec_with_reduce_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
1699  cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>>;
1700 
1701 template<bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
1702 using cuda_exec_base_explicit_async = std::conditional_t<
1703  with_reduce,
1704  cuda_exec_with_reduce_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
1705  cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>>;
1706 
1707 template<bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
1708 using cuda_exec_base =
1709  std::conditional_t<with_reduce,
1710  cuda_exec_with_reduce<BLOCK_SIZE, Async>,
1711  cuda_exec<BLOCK_SIZE, Async>>;
1712 
1713 template<bool with_reduce, size_t BLOCK_SIZE>
1714 using cuda_exec_base_async =
1715  std::conditional_t<with_reduce,
1716  cuda_exec_with_reduce_async<BLOCK_SIZE>,
1717  cuda_exec_async<BLOCK_SIZE>>;
1718 
1719 
1720 // policies usable with WorkGroup
1721 template<size_t BLOCK_SIZE,
1722  size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
1723  bool Async = false>
1724 using cuda_work_explicit =
1725  policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
1726 
1727 template<size_t BLOCK_SIZE,
1728  size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
1729 using cuda_work_explicit_async =
1730  policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
1731 
1732 template<size_t BLOCK_SIZE, bool Async = false>
1733 using cuda_work = policy::cuda::
1734  cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
1735 
1736 template<size_t BLOCK_SIZE>
1737 using cuda_work_async = policy::cuda::
1738  cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
1739 
// Make policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average
// available under its unqualified name in the enclosing namespace.
1740 using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
1741 
1742 // policies usable with atomics
// (using-declarations: the policy::cuda names become available unqualified
// in the enclosing namespace)
1743 using policy::cuda::cuda_atomic;
1744 using policy::cuda::cuda_atomic_explicit;
1745 
1746 
1747 // policies usable with reducers
1748 template<cuda::reduce_algorithm algorithm,
1749  cuda::block_communication_mode comm_mode,
1750  size_t replication = named_usage::unspecified,
1751  size_t atomic_stride = named_usage::unspecified>
1752 using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
1753  cuda::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
1754 
1755 // Policies for RAJA::Reduce* objects with specific behaviors.
1756 // - non-atomic policies store partial results and combine them in the same
1757 // order every time, leading to consistent results for a loop run to run.
1758 // - *atomic* policies may use atomics to combine partial results. The
1759 // use of atomics leads to order of operation differences which change the
1760 // results of floating point sum reductions for a loop run to run. Falls back
1761 // on a non-atomic implementation if atomics can't be used with the given
1762 // type. The memory used with atomics is initialized on the device using
1763 // atomics which adds overhead.
1764 // - *atomic_host* policies are similar to the atomic policies above. However
1765 // the memory used with atomics is initialized on the host. This is faster
1766 // overall than other policies on HW with direct host access to device memory
1767 // such as the IBM power 9 + Nvidia V100 Sierra/Lassen systems.
1768 // - *device_fence* policies use normal memory accesses with device scope fences
1769 // in the implementation. This works on all HW.
1770 // - *block_fence* policies use special (atomic) memory accesses that use
1771 // a cache shared by the whole device to avoid having to use
1772 // device scope fences. This improves performance on some HW but
1773 // is more difficult to code correctly.
// Concrete reduction policies: each alias selects a reduce_algorithm and a
// block_communication_mode for cuda_reduce_tuning.
// NOTE(review): in this listing every alias below stops after the
// communication-mode argument (the embedded line numbers skip three lines
// each time), so the remaining template arguments and the closing `>;` are
// not visible here — confirm them against the upstream header.

// combine_last_block algorithm, device-scope fences.
1774 using cuda_reduce_device_fence =
1775  cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
1776  cuda::block_communication_mode::device_fence,
// combine_last_block algorithm, block-scope fences.
1780 using cuda_reduce_block_fence =
1781  cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
1782  cuda::block_communication_mode::block_fence,
// init_device_combine_atomic_block algorithm, device-scope fences.
1786 using cuda_reduce_atomic_device_init_device_fence =
1787  cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
1788  cuda::block_communication_mode::device_fence,
// init_device_combine_atomic_block algorithm, block-scope fences.
1792 using cuda_reduce_atomic_device_init_block_fence =
1793  cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
1794  cuda::block_communication_mode::block_fence,
// init_host_combine_atomic_block algorithm, device-scope fences.
1798 using cuda_reduce_atomic_host_init_device_fence =
1799  cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
1800  cuda::block_communication_mode::device_fence,
// init_host_combine_atomic_block algorithm, block-scope fences.
1804 using cuda_reduce_atomic_host_init_block_fence =
1805  cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
1806  cuda::block_communication_mode::block_fence,
1809 
1810 // Policy for RAJA::Reduce* objects that gives the same answer every time when
1811 // used in the same way
1812 using cuda_reduce = cuda_reduce_device_fence;
1813 
1814 // Policy for RAJA::Reduce* objects that may use atomics and may not give the
1815 // same answer every time when used in the same way
1816 using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence;
1817 
1818 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
1819 // non-atomic policy with a bool
1820 template<bool with_atomic>
1821 using cuda_reduce_base =
1822  std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
1823 
1824 
1825 // policies usable with multi_reducers
1826 template<cuda::multi_reduce_algorithm algorithm,
1827  typename SharedAtomicReplicationConcretizer,
1828  typename SharedAtomicReplicationIndexer,
1829  typename GlobalAtomicReplicationConcretizer,
1830  typename GlobalAtomicReplicationIndexer>
1831 using cuda_multi_reduce_tuning =
1832  policy::cuda::cuda_multi_reduce_policy<cuda::MultiReduceTuning<
1833  algorithm,
1834  cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
1835  SharedAtomicReplicationIndexer,
1836  GetOffsetRight<int>>,
1837  cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
1838  GlobalAtomicReplicationIndexer,
1839  GetOffsetLeft<int>>>>;
1840 
1841 // Policies for RAJA::MultiReduce* objects with specific behaviors.
1842 // - *atomic* policies may use atomics to combine partial results. The
1843 // use of atomics leads to order of operation differences which change the
1844 // results of floating point sum reductions for a loop run to run.
1845 // - *no_replication* policies use the minimum amount of resources. The
1846 // lack of resources means they may perform poorly. These policies are
1847 // intended for use cases where low overhead is more important than high
1848 // performance such as error flags that are rarely set.
1849 // - *host_init* policies initialize memory used with atomics on the host.
1850 // This is faster overall than other policies on HW with direct host access
1851 // to device memory such as the IBM power 9 + Nvidia V100 Sierra/Lassen
1852 // systems.
// Block atomics then grid atomics, host-initialized: shared replication is
// the max power of two with a preferred replication of 16, indexed by
// thread_xyz<>; global replication is the min power of two with a preferred
// replication of 2, indexed by warp_global_xyz<>.
1853 using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init =
1854  cuda_multi_reduce_tuning<
1855  cuda::multi_reduce_algorithm::
1856  init_host_combine_block_atomic_then_grid_atomic,
1857  cuda::SharedAtomicReplicationMaxPow2Concretizer<
1858  cuda::ConstantPreferredReplicationConcretizer<16>>,
1859  cuda::thread_xyz<>,
1860  cuda::GlobalAtomicReplicationMinPow2Concretizer<
1861  cuda::ConstantPreferredReplicationConcretizer<2>>,
1862  cuda::warp_global_xyz<>>;
1863 // special policy to test that multi-reducers work if there is not enough shmem
// (identical to the policy above except that the preferred shared
// replication is 0)
1864 using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
1865  cuda_multi_reduce_tuning<
1866  cuda::multi_reduce_algorithm::
1867  init_host_combine_block_atomic_then_grid_atomic,
1868  cuda::SharedAtomicReplicationMaxPow2Concretizer<
1869  cuda::ConstantPreferredReplicationConcretizer<0>>,
1870  cuda::thread_xyz<>,
1871  cuda::GlobalAtomicReplicationMinPow2Concretizer<
1872  cuda::ConstantPreferredReplicationConcretizer<2>>,
1873  cuda::warp_global_xyz<>>;
1874 // Global atomics only, host-initialized: preferred global replication of 2,
// indexed by warp_global_xyz<>.
1875 using cuda_multi_reduce_atomic_global_host_init = cuda_multi_reduce_tuning<
1876  cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
1877  void, // unused with this algorithm
1878  void, // unused with this algorithm
1879  cuda::GlobalAtomicReplicationMinPow2Concretizer<
1880  cuda::ConstantPreferredReplicationConcretizer<2>>,
1881  cuda::warp_global_xyz<>>;
1882 // Global atomics only, host-initialized, preferred global replication of 1,
// indexed by block_xyz<>.
1883 using cuda_multi_reduce_atomic_global_no_replication_host_init =
1884  cuda_multi_reduce_tuning<
1885  cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
1886  void, // unused with this algorithm
1887  void, // unused with this algorithm
1888  cuda::GlobalAtomicReplicationMinPow2Concretizer<
1889  cuda::ConstantPreferredReplicationConcretizer<1>>,
1890  cuda::block_xyz<>>;
1891 
1892 // Policy for RAJA::MultiReduce* objects that may use atomics and may not give
1893 // the same answer every time when used in the same way
// (currently the block-then-grid atomic, host-initialized policy)
1894 using cuda_multi_reduce_atomic =
1895  cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
1896 // Similar to above but optimized for low overhead in cases where it is rarely
1897 // used
// (currently the global-atomic, no-replication, host-initialized policy)
1898 using cuda_multi_reduce_atomic_low_performance_low_overhead =
1899  cuda_multi_reduce_atomic_global_no_replication_host_init;
1900 
1901 
1902 // policies usable with kernel
// (using-declarations: the policy::cuda names become available unqualified
// in the enclosing namespace)
1903 using policy::cuda::cuda_block_reduce;
1904 using policy::cuda::cuda_warp_reduce;
1905 
// Warp-level indexer policies: each maps iterations onto thread_x with a
// block size of device_constants.WARP_SIZE, using the DirectUnchecked,
// Direct and StridedLoop iteration mappings respectively.
// NOTE(review): the embedded line numbers skip one line inside each alias
// (between the iteration mapping and the indexer), so a template argument
// may be missing from this listing — confirm against the upstream header.
1906 using cuda_warp_direct_unchecked = RAJA::policy::cuda::cuda_indexer<
1907  iteration_mapping::DirectUnchecked,
1909  cuda::thread_x<RAJA::policy::cuda::device_constants.WARP_SIZE>>;
1910 using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer<
1911  iteration_mapping::Direct,
1913  cuda::thread_x<RAJA::policy::cuda::device_constants.WARP_SIZE>>;
1914 using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer<
1915  iteration_mapping::StridedLoop<named_usage::unspecified>,
1917  cuda::thread_x<RAJA::policy::cuda::device_constants.WARP_SIZE>>;
1918 
// Make the warp-masked policies from policy::cuda available unqualified in
// the enclosing namespace.
1919 using policy::cuda::cuda_warp_masked_direct;
1920 using policy::cuda::cuda_warp_masked_loop;
1921 
// Make the thread-masked policies from policy::cuda available unqualified in
// the enclosing namespace.
1922 using policy::cuda::cuda_thread_masked_direct;
1923 using policy::cuda::cuda_thread_masked_loop;
1924 
1925 // policies usable with synchronize
// (using-declaration: the policy::cuda name becomes available unqualified
// in the enclosing namespace)
1926 using policy::cuda::cuda_synchronize;
1927 
1928 // policies usable with launch
1929 template<bool Async,
1930  int num_threads = named_usage::unspecified,
1931  size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
1932 using cuda_launch_explicit_t =
1933  policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
1934 
1935 // CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
// The blocks-per-SM argument is therefore chosen by a conditional on
// num_threads; the visible branch yields MIN_BLOCKS_PER_SM.
// NOTE(review): the embedded line numbers skip two lines inside the
// conditional, so its compared value and first branch are not visible in
// this listing — confirm against the upstream header.
1936 template<bool Async, int num_threads = named_usage::unspecified>
1937 using cuda_launch_t =
1938  policy::cuda::cuda_launch_explicit_t<Async,
1939  num_threads,
1940  (num_threads ==
1943  : policy::cuda::MIN_BLOCKS_PER_SM>;
1944 
1945 
1946 // policies usable with kernel and launch
// Each alias below forwards a pack of indexers to policy::cuda::cuda_indexer
// (one indexer per loop) or policy::cuda::cuda_flatten_indexer with a fixed
// iteration mapping: DirectUnchecked, Direct, or StridedLoop.
// NOTE(review): the embedded line numbers skip one line inside every alias
// (between the iteration mapping and `indexers...`), so a template argument
// may be missing from this listing; as shown, cuda_indexer_loop and
// cuda_indexer_syncable_loop are textually identical — confirm against the
// upstream header.
1947 template<typename... indexers>
1948 using cuda_indexer_direct_unchecked =
1949  policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
1951  indexers...>;
1952 
1953 template<typename... indexers>
1954 using cuda_indexer_direct =
1955  policy::cuda::cuda_indexer<iteration_mapping::Direct,
1957  indexers...>;
1958 
1959 template<typename... indexers>
1960 using cuda_indexer_loop = policy::cuda::cuda_indexer<
1961  iteration_mapping::StridedLoop<named_usage::unspecified>,
1963  indexers...>;
1964 
1965 template<typename... indexers>
1966 using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer<
1967  iteration_mapping::StridedLoop<named_usage::unspecified>,
1969  indexers...>;
1970 
1971 template<typename... indexers>
1972 using cuda_flatten_indexer_direct_unchecked =
1973  policy::cuda::cuda_flatten_indexer<iteration_mapping::DirectUnchecked,
1975  indexers...>;
1976 
1977 template<typename... indexers>
1978 using cuda_flatten_indexer_direct =
1979  policy::cuda::cuda_flatten_indexer<iteration_mapping::Direct,
1981  indexers...>;
1982 
1983 template<typename... indexers>
1984 using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
1985  iteration_mapping::StridedLoop<named_usage::unspecified>,
1987  indexers...>;
1988 
1989 
1997 template<named_dim... dims>
1998 using cuda_thread_direct_unchecked = cuda_indexer_direct_unchecked<
1999  cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2000 using cuda_thread_x_direct_unchecked =
2001  cuda_thread_direct_unchecked<named_dim::x>;
2002 using cuda_thread_y_direct_unchecked =
2003  cuda_thread_direct_unchecked<named_dim::y>;
2004 using cuda_thread_z_direct_unchecked =
2005  cuda_thread_direct_unchecked<named_dim::z>;
2006 using cuda_thread_xy_direct_unchecked =
2007  cuda_thread_direct_unchecked<named_dim::x, named_dim::y>;
2008 using cuda_thread_xz_direct_unchecked =
2009  cuda_thread_direct_unchecked<named_dim::x, named_dim::z>;
2010 using cuda_thread_yx_direct_unchecked =
2011  cuda_thread_direct_unchecked<named_dim::y, named_dim::x>;
2012 using cuda_thread_yz_direct_unchecked =
2013  cuda_thread_direct_unchecked<named_dim::y, named_dim::z>;
2014 using cuda_thread_zx_direct_unchecked =
2015  cuda_thread_direct_unchecked<named_dim::z, named_dim::x>;
2016 using cuda_thread_zy_direct_unchecked =
2017  cuda_thread_direct_unchecked<named_dim::z, named_dim::y>;
2018 using cuda_thread_xyz_direct_unchecked =
2019  cuda_thread_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
2020 using cuda_thread_xzy_direct_unchecked =
2021  cuda_thread_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
2022 using cuda_thread_yxz_direct_unchecked =
2023  cuda_thread_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
2024 using cuda_thread_yzx_direct_unchecked =
2025  cuda_thread_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
2026 using cuda_thread_zxy_direct_unchecked =
2027  cuda_thread_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
2028 using cuda_thread_zyx_direct_unchecked =
2029  cuda_thread_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
2030 
2031 template<named_dim... dims>
2032 using cuda_block_direct_unchecked = cuda_indexer_direct_unchecked<
2033  cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2034 using cuda_block_x_direct_unchecked = cuda_block_direct_unchecked<named_dim::x>;
2035 using cuda_block_y_direct_unchecked = cuda_block_direct_unchecked<named_dim::y>;
2036 using cuda_block_z_direct_unchecked = cuda_block_direct_unchecked<named_dim::z>;
2037 using cuda_block_xy_direct_unchecked =
2038  cuda_block_direct_unchecked<named_dim::x, named_dim::y>;
2039 using cuda_block_xz_direct_unchecked =
2040  cuda_block_direct_unchecked<named_dim::x, named_dim::z>;
2041 using cuda_block_yx_direct_unchecked =
2042  cuda_block_direct_unchecked<named_dim::y, named_dim::x>;
2043 using cuda_block_yz_direct_unchecked =
2044  cuda_block_direct_unchecked<named_dim::y, named_dim::z>;
2045 using cuda_block_zx_direct_unchecked =
2046  cuda_block_direct_unchecked<named_dim::z, named_dim::x>;
2047 using cuda_block_zy_direct_unchecked =
2048  cuda_block_direct_unchecked<named_dim::z, named_dim::y>;
2049 using cuda_block_xyz_direct_unchecked =
2050  cuda_block_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
2051 using cuda_block_xzy_direct_unchecked =
2052  cuda_block_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
2053 using cuda_block_yxz_direct_unchecked =
2054  cuda_block_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
2055 using cuda_block_yzx_direct_unchecked =
2056  cuda_block_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
2057 using cuda_block_zxy_direct_unchecked =
2058  cuda_block_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
2059 using cuda_block_zyx_direct_unchecked =
2060  cuda_block_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
2061 
2062 template<named_dim... dims>
2063 using cuda_global_direct_unchecked = cuda_indexer_direct_unchecked<
2064  cuda::IndexGlobal<dims,
2067 using cuda_global_x_direct_unchecked =
2068  cuda_global_direct_unchecked<named_dim::x>;
2069 using cuda_global_y_direct_unchecked =
2070  cuda_global_direct_unchecked<named_dim::y>;
2071 using cuda_global_z_direct_unchecked =
2072  cuda_global_direct_unchecked<named_dim::z>;
2073 using cuda_global_xy_direct_unchecked =
2074  cuda_global_direct_unchecked<named_dim::x, named_dim::y>;
2075 using cuda_global_xz_direct_unchecked =
2076  cuda_global_direct_unchecked<named_dim::x, named_dim::z>;
2077 using cuda_global_yx_direct_unchecked =
2078  cuda_global_direct_unchecked<named_dim::y, named_dim::x>;
2079 using cuda_global_yz_direct_unchecked =
2080  cuda_global_direct_unchecked<named_dim::y, named_dim::z>;
2081 using cuda_global_zx_direct_unchecked =
2082  cuda_global_direct_unchecked<named_dim::z, named_dim::x>;
2083 using cuda_global_zy_direct_unchecked =
2084  cuda_global_direct_unchecked<named_dim::z, named_dim::y>;
2085 using cuda_global_xyz_direct_unchecked =
2086  cuda_global_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
2087 using cuda_global_xzy_direct_unchecked =
2088  cuda_global_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
2089 using cuda_global_yxz_direct_unchecked =
2090  cuda_global_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
2091 using cuda_global_yzx_direct_unchecked =
2092  cuda_global_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
2093 using cuda_global_zxy_direct_unchecked =
2094  cuda_global_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
2095 using cuda_global_zyx_direct_unchecked =
2096  cuda_global_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
2097 
2105 template<named_dim... dims>
2106 using cuda_thread_direct = cuda_indexer_direct<
2107  cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2108 using cuda_thread_x_direct = cuda_thread_direct<named_dim::x>;
2109 using cuda_thread_y_direct = cuda_thread_direct<named_dim::y>;
2110 using cuda_thread_z_direct = cuda_thread_direct<named_dim::z>;
2111 using cuda_thread_xy_direct = cuda_thread_direct<named_dim::x, named_dim::y>;
2112 using cuda_thread_xz_direct = cuda_thread_direct<named_dim::x, named_dim::z>;
2113 using cuda_thread_yx_direct = cuda_thread_direct<named_dim::y, named_dim::x>;
2114 using cuda_thread_yz_direct = cuda_thread_direct<named_dim::y, named_dim::z>;
2115 using cuda_thread_zx_direct = cuda_thread_direct<named_dim::z, named_dim::x>;
2116 using cuda_thread_zy_direct = cuda_thread_direct<named_dim::z, named_dim::y>;
2117 using cuda_thread_xyz_direct =
2118  cuda_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
2119 using cuda_thread_xzy_direct =
2120  cuda_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
2121 using cuda_thread_yxz_direct =
2122  cuda_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
2123 using cuda_thread_yzx_direct =
2124  cuda_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
2125 using cuda_thread_zxy_direct =
2126  cuda_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
2127 using cuda_thread_zyx_direct =
2128  cuda_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
2129 
2130 template<named_dim... dims>
2131 using cuda_block_direct = cuda_indexer_direct<
2132  cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2133 using cuda_block_x_direct = cuda_block_direct<named_dim::x>;
2134 using cuda_block_y_direct = cuda_block_direct<named_dim::y>;
2135 using cuda_block_z_direct = cuda_block_direct<named_dim::z>;
2136 using cuda_block_xy_direct = cuda_block_direct<named_dim::x, named_dim::y>;
2137 using cuda_block_xz_direct = cuda_block_direct<named_dim::x, named_dim::z>;
2138 using cuda_block_yx_direct = cuda_block_direct<named_dim::y, named_dim::x>;
2139 using cuda_block_yz_direct = cuda_block_direct<named_dim::y, named_dim::z>;
2140 using cuda_block_zx_direct = cuda_block_direct<named_dim::z, named_dim::x>;
2141 using cuda_block_zy_direct = cuda_block_direct<named_dim::z, named_dim::y>;
2142 using cuda_block_xyz_direct =
2143  cuda_block_direct<named_dim::x, named_dim::y, named_dim::z>;
2144 using cuda_block_xzy_direct =
2145  cuda_block_direct<named_dim::x, named_dim::z, named_dim::y>;
2146 using cuda_block_yxz_direct =
2147  cuda_block_direct<named_dim::y, named_dim::x, named_dim::z>;
2148 using cuda_block_yzx_direct =
2149  cuda_block_direct<named_dim::y, named_dim::z, named_dim::x>;
2150 using cuda_block_zxy_direct =
2151  cuda_block_direct<named_dim::z, named_dim::x, named_dim::y>;
2152 using cuda_block_zyx_direct =
2153  cuda_block_direct<named_dim::z, named_dim::y, named_dim::x>;
2154 
2155 template<named_dim... dims>
2156 using cuda_global_direct =
2157  cuda_indexer_direct<cuda::IndexGlobal<dims,
2160 using cuda_global_x_direct = cuda_global_direct<named_dim::x>;
2161 using cuda_global_y_direct = cuda_global_direct<named_dim::y>;
2162 using cuda_global_z_direct = cuda_global_direct<named_dim::z>;
2163 using cuda_global_xy_direct = cuda_global_direct<named_dim::x, named_dim::y>;
2164 using cuda_global_xz_direct = cuda_global_direct<named_dim::x, named_dim::z>;
2165 using cuda_global_yx_direct = cuda_global_direct<named_dim::y, named_dim::x>;
2166 using cuda_global_yz_direct = cuda_global_direct<named_dim::y, named_dim::z>;
2167 using cuda_global_zx_direct = cuda_global_direct<named_dim::z, named_dim::x>;
2168 using cuda_global_zy_direct = cuda_global_direct<named_dim::z, named_dim::y>;
2169 using cuda_global_xyz_direct =
2170  cuda_global_direct<named_dim::x, named_dim::y, named_dim::z>;
2171 using cuda_global_xzy_direct =
2172  cuda_global_direct<named_dim::x, named_dim::z, named_dim::y>;
2173 using cuda_global_yxz_direct =
2174  cuda_global_direct<named_dim::y, named_dim::x, named_dim::z>;
2175 using cuda_global_yzx_direct =
2176  cuda_global_direct<named_dim::y, named_dim::z, named_dim::x>;
2177 using cuda_global_zxy_direct =
2178  cuda_global_direct<named_dim::z, named_dim::x, named_dim::y>;
2179 using cuda_global_zyx_direct =
2180  cuda_global_direct<named_dim::z, named_dim::y, named_dim::x>;
2181 
2187 template<named_dim... dims>
2188 using cuda_thread_loop = cuda_indexer_loop<
2189  cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2190 using cuda_thread_x_loop = cuda_thread_loop<named_dim::x>;
2191 using cuda_thread_y_loop = cuda_thread_loop<named_dim::y>;
2192 using cuda_thread_z_loop = cuda_thread_loop<named_dim::z>;
2193 using cuda_thread_xy_loop = cuda_thread_loop<named_dim::x, named_dim::y>;
2194 using cuda_thread_xz_loop = cuda_thread_loop<named_dim::x, named_dim::z>;
2195 using cuda_thread_yx_loop = cuda_thread_loop<named_dim::y, named_dim::x>;
2196 using cuda_thread_yz_loop = cuda_thread_loop<named_dim::y, named_dim::z>;
2197 using cuda_thread_zx_loop = cuda_thread_loop<named_dim::z, named_dim::x>;
2198 using cuda_thread_zy_loop = cuda_thread_loop<named_dim::z, named_dim::y>;
2199 using cuda_thread_xyz_loop =
2200  cuda_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
2201 using cuda_thread_xzy_loop =
2202  cuda_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
2203 using cuda_thread_yxz_loop =
2204  cuda_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
2205 using cuda_thread_yzx_loop =
2206  cuda_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
2207 using cuda_thread_zxy_loop =
2208  cuda_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
2209 using cuda_thread_zyx_loop =
2210  cuda_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
2211 
2212 template<named_dim... dims>
2213 using cuda_block_loop = cuda_indexer_loop<
2214  cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2215 using cuda_block_x_loop = cuda_block_loop<named_dim::x>;
2216 using cuda_block_y_loop = cuda_block_loop<named_dim::y>;
2217 using cuda_block_z_loop = cuda_block_loop<named_dim::z>;
2218 using cuda_block_xy_loop = cuda_block_loop<named_dim::x, named_dim::y>;
2219 using cuda_block_xz_loop = cuda_block_loop<named_dim::x, named_dim::z>;
2220 using cuda_block_yx_loop = cuda_block_loop<named_dim::y, named_dim::x>;
2221 using cuda_block_yz_loop = cuda_block_loop<named_dim::y, named_dim::z>;
2222 using cuda_block_zx_loop = cuda_block_loop<named_dim::z, named_dim::x>;
2223 using cuda_block_zy_loop = cuda_block_loop<named_dim::z, named_dim::y>;
2224 using cuda_block_xyz_loop =
2225  cuda_block_loop<named_dim::x, named_dim::y, named_dim::z>;
2226 using cuda_block_xzy_loop =
2227  cuda_block_loop<named_dim::x, named_dim::z, named_dim::y>;
2228 using cuda_block_yxz_loop =
2229  cuda_block_loop<named_dim::y, named_dim::x, named_dim::z>;
2230 using cuda_block_yzx_loop =
2231  cuda_block_loop<named_dim::y, named_dim::z, named_dim::x>;
2232 using cuda_block_zxy_loop =
2233  cuda_block_loop<named_dim::z, named_dim::x, named_dim::y>;
2234 using cuda_block_zyx_loop =
2235  cuda_block_loop<named_dim::z, named_dim::y, named_dim::x>;
2236 
2237 template<named_dim... dims>
2238 using cuda_global_loop =
2239  cuda_indexer_loop<cuda::IndexGlobal<dims,
2242 using cuda_global_x_loop = cuda_global_loop<named_dim::x>;
2243 using cuda_global_y_loop = cuda_global_loop<named_dim::y>;
2244 using cuda_global_z_loop = cuda_global_loop<named_dim::z>;
2245 using cuda_global_xy_loop = cuda_global_loop<named_dim::x, named_dim::y>;
2246 using cuda_global_xz_loop = cuda_global_loop<named_dim::x, named_dim::z>;
2247 using cuda_global_yx_loop = cuda_global_loop<named_dim::y, named_dim::x>;
2248 using cuda_global_yz_loop = cuda_global_loop<named_dim::y, named_dim::z>;
2249 using cuda_global_zx_loop = cuda_global_loop<named_dim::z, named_dim::x>;
2250 using cuda_global_zy_loop = cuda_global_loop<named_dim::z, named_dim::y>;
2251 using cuda_global_xyz_loop =
2252  cuda_global_loop<named_dim::x, named_dim::y, named_dim::z>;
2253 using cuda_global_xzy_loop =
2254  cuda_global_loop<named_dim::x, named_dim::z, named_dim::y>;
2255 using cuda_global_yxz_loop =
2256  cuda_global_loop<named_dim::y, named_dim::x, named_dim::z>;
2257 using cuda_global_yzx_loop =
2258  cuda_global_loop<named_dim::y, named_dim::z, named_dim::x>;
2259 using cuda_global_zxy_loop =
2260  cuda_global_loop<named_dim::z, named_dim::x, named_dim::y>;
2261 using cuda_global_zyx_loop =
2262  cuda_global_loop<named_dim::z, named_dim::y, named_dim::x>;
2263 
2271 template<named_dim... dims>
2272 using cuda_thread_syncable_loop = cuda_indexer_syncable_loop<
2273  cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2274 using cuda_thread_x_syncable_loop = cuda_thread_syncable_loop<named_dim::x>;
2275 using cuda_thread_y_syncable_loop = cuda_thread_syncable_loop<named_dim::y>;
2276 using cuda_thread_z_syncable_loop = cuda_thread_syncable_loop<named_dim::z>;
2277 using cuda_thread_xy_syncable_loop =
2278  cuda_thread_syncable_loop<named_dim::x, named_dim::y>;
2279 using cuda_thread_xz_syncable_loop =
2280  cuda_thread_syncable_loop<named_dim::x, named_dim::z>;
2281 using cuda_thread_yx_syncable_loop =
2282  cuda_thread_syncable_loop<named_dim::y, named_dim::x>;
2283 using cuda_thread_yz_syncable_loop =
2284  cuda_thread_syncable_loop<named_dim::y, named_dim::z>;
2285 using cuda_thread_zx_syncable_loop =
2286  cuda_thread_syncable_loop<named_dim::z, named_dim::x>;
2287 using cuda_thread_zy_syncable_loop =
2288  cuda_thread_syncable_loop<named_dim::z, named_dim::y>;
2289 using cuda_thread_xyz_syncable_loop =
2290  cuda_thread_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2291 using cuda_thread_xzy_syncable_loop =
2292  cuda_thread_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2293 using cuda_thread_yxz_syncable_loop =
2294  cuda_thread_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2295 using cuda_thread_yzx_syncable_loop =
2296  cuda_thread_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2297 using cuda_thread_zxy_syncable_loop =
2298  cuda_thread_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2299 using cuda_thread_zyx_syncable_loop =
2300  cuda_thread_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
2301 
2302 template<named_dim... dims>
2303 using cuda_block_syncable_loop = cuda_indexer_syncable_loop<
2304  cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2305 using cuda_block_x_syncable_loop = cuda_block_syncable_loop<named_dim::x>;
2306 using cuda_block_y_syncable_loop = cuda_block_syncable_loop<named_dim::y>;
2307 using cuda_block_z_syncable_loop = cuda_block_syncable_loop<named_dim::z>;
2308 using cuda_block_xy_syncable_loop =
2309  cuda_block_syncable_loop<named_dim::x, named_dim::y>;
2310 using cuda_block_xz_syncable_loop =
2311  cuda_block_syncable_loop<named_dim::x, named_dim::z>;
2312 using cuda_block_yx_syncable_loop =
2313  cuda_block_syncable_loop<named_dim::y, named_dim::x>;
2314 using cuda_block_yz_syncable_loop =
2315  cuda_block_syncable_loop<named_dim::y, named_dim::z>;
2316 using cuda_block_zx_syncable_loop =
2317  cuda_block_syncable_loop<named_dim::z, named_dim::x>;
2318 using cuda_block_zy_syncable_loop =
2319  cuda_block_syncable_loop<named_dim::z, named_dim::y>;
2320 using cuda_block_xyz_syncable_loop =
2321  cuda_block_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2322 using cuda_block_xzy_syncable_loop =
2323  cuda_block_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2324 using cuda_block_yxz_syncable_loop =
2325  cuda_block_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2326 using cuda_block_yzx_syncable_loop =
2327  cuda_block_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2328 using cuda_block_zxy_syncable_loop =
2329  cuda_block_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2330 using cuda_block_zyx_syncable_loop =
2331  cuda_block_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
2332 
2333 template<named_dim... dims>
2334 using cuda_global_syncable_loop =
2335  cuda_indexer_syncable_loop<cuda::IndexGlobal<dims,
2338 using cuda_global_x_syncable_loop = cuda_global_syncable_loop<named_dim::x>;
2339 using cuda_global_y_syncable_loop = cuda_global_syncable_loop<named_dim::y>;
2340 using cuda_global_z_syncable_loop = cuda_global_syncable_loop<named_dim::z>;
2341 using cuda_global_xy_syncable_loop =
2342  cuda_global_syncable_loop<named_dim::x, named_dim::y>;
2343 using cuda_global_xz_syncable_loop =
2344  cuda_global_syncable_loop<named_dim::x, named_dim::z>;
2345 using cuda_global_yx_syncable_loop =
2346  cuda_global_syncable_loop<named_dim::y, named_dim::x>;
2347 using cuda_global_yz_syncable_loop =
2348  cuda_global_syncable_loop<named_dim::y, named_dim::z>;
2349 using cuda_global_zx_syncable_loop =
2350  cuda_global_syncable_loop<named_dim::z, named_dim::x>;
2351 using cuda_global_zy_syncable_loop =
2352  cuda_global_syncable_loop<named_dim::z, named_dim::y>;
2353 using cuda_global_xyz_syncable_loop =
2354  cuda_global_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2355 using cuda_global_xzy_syncable_loop =
2356  cuda_global_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2357 using cuda_global_yxz_syncable_loop =
2358  cuda_global_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2359 using cuda_global_yzx_syncable_loop =
2360  cuda_global_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2361 using cuda_global_zxy_syncable_loop =
2362  cuda_global_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2363 using cuda_global_zyx_syncable_loop =
2364  cuda_global_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
2365 
2366 /*
2367  * Maps segment indices to flattened CUDA threads, blocks, or global threads.
2368  * This is the lowest overhead mapping, but requires that there are the same
2369  * number of physical threads, blocks, or global threads as map requests.
2370  * Reshapes multiple physical threads, blocks, or global threads into a 1D
2371  * iteration space
2372  */
2373 template<named_dim... dims>
2374 using cuda_flatten_thread_direct_unchecked =
2375  cuda_flatten_indexer_direct_unchecked<
2376  cuda::IndexGlobal<dims,
2378  named_usage::ignored>...>;
2379 using cuda_flatten_thread_x_direct_unchecked =
2380  cuda_flatten_thread_direct_unchecked<named_dim::x>;
2381 using cuda_flatten_thread_y_direct_unchecked =
2382  cuda_flatten_thread_direct_unchecked<named_dim::y>;
2383 using cuda_flatten_thread_z_direct_unchecked =
2384  cuda_flatten_thread_direct_unchecked<named_dim::z>;
2385 using cuda_flatten_thread_xy_direct_unchecked =
2386  cuda_flatten_thread_direct_unchecked<named_dim::x, named_dim::y>;
2387 using cuda_flatten_thread_xz_direct_unchecked =
2388  cuda_flatten_thread_direct_unchecked<named_dim::x, named_dim::z>;
2389 using cuda_flatten_thread_yx_direct_unchecked =
2390  cuda_flatten_thread_direct_unchecked<named_dim::y, named_dim::x>;
2391 using cuda_flatten_thread_yz_direct_unchecked =
2392  cuda_flatten_thread_direct_unchecked<named_dim::y, named_dim::z>;
2393 using cuda_flatten_thread_zx_direct_unchecked =
2394  cuda_flatten_thread_direct_unchecked<named_dim::z, named_dim::x>;
2395 using cuda_flatten_thread_zy_direct_unchecked =
2396  cuda_flatten_thread_direct_unchecked<named_dim::z, named_dim::y>;
2397 using cuda_flatten_thread_xyz_direct_unchecked =
2398  cuda_flatten_thread_direct_unchecked<named_dim::x,
2399  named_dim::y,
2400  named_dim::z>;
2401 using cuda_flatten_thread_xzy_direct_unchecked =
2402  cuda_flatten_thread_direct_unchecked<named_dim::x,
2403  named_dim::z,
2404  named_dim::y>;
2405 using cuda_flatten_thread_yxz_direct_unchecked =
2406  cuda_flatten_thread_direct_unchecked<named_dim::y,
2407  named_dim::x,
2408  named_dim::z>;
2409 using cuda_flatten_thread_yzx_direct_unchecked =
2410  cuda_flatten_thread_direct_unchecked<named_dim::y,
2411  named_dim::z,
2412  named_dim::x>;
2413 using cuda_flatten_thread_zxy_direct_unchecked =
2414  cuda_flatten_thread_direct_unchecked<named_dim::z,
2415  named_dim::x,
2416  named_dim::y>;
2417 using cuda_flatten_thread_zyx_direct_unchecked =
2418  cuda_flatten_thread_direct_unchecked<named_dim::z,
2419  named_dim::y,
2420  named_dim::x>;
2421 
2422 template<named_dim... dims>
2423 using cuda_flatten_block_direct_unchecked =
2424  cuda_flatten_indexer_direct_unchecked<
2425  cuda::IndexGlobal<dims,
2428 using cuda_flatten_block_x_direct_unchecked =
2429  cuda_flatten_block_direct_unchecked<named_dim::x>;
2430 using cuda_flatten_block_y_direct_unchecked =
2431  cuda_flatten_block_direct_unchecked<named_dim::y>;
2432 using cuda_flatten_block_z_direct_unchecked =
2433  cuda_flatten_block_direct_unchecked<named_dim::z>;
2434 using cuda_flatten_block_xy_direct_unchecked =
2435  cuda_flatten_block_direct_unchecked<named_dim::x, named_dim::y>;
2436 using cuda_flatten_block_xz_direct_unchecked =
2437  cuda_flatten_block_direct_unchecked<named_dim::x, named_dim::z>;
2438 using cuda_flatten_block_yx_direct_unchecked =
2439  cuda_flatten_block_direct_unchecked<named_dim::y, named_dim::x>;
2440 using cuda_flatten_block_yz_direct_unchecked =
2441  cuda_flatten_block_direct_unchecked<named_dim::y, named_dim::z>;
2442 using cuda_flatten_block_zx_direct_unchecked =
2443  cuda_flatten_block_direct_unchecked<named_dim::z, named_dim::x>;
2444 using cuda_flatten_block_zy_direct_unchecked =
2445  cuda_flatten_block_direct_unchecked<named_dim::z, named_dim::y>;
2446 using cuda_flatten_block_xyz_direct_unchecked =
2447  cuda_flatten_block_direct_unchecked<named_dim::x,
2448  named_dim::y,
2449  named_dim::z>;
2450 using cuda_flatten_block_xzy_direct_unchecked =
2451  cuda_flatten_block_direct_unchecked<named_dim::x,
2452  named_dim::z,
2453  named_dim::y>;
2454 using cuda_flatten_block_yxz_direct_unchecked =
2455  cuda_flatten_block_direct_unchecked<named_dim::y,
2456  named_dim::x,
2457  named_dim::z>;
2458 using cuda_flatten_block_yzx_direct_unchecked =
2459  cuda_flatten_block_direct_unchecked<named_dim::y,
2460  named_dim::z,
2461  named_dim::x>;
2462 using cuda_flatten_block_zxy_direct_unchecked =
2463  cuda_flatten_block_direct_unchecked<named_dim::z,
2464  named_dim::x,
2465  named_dim::y>;
2466 using cuda_flatten_block_zyx_direct_unchecked =
2467  cuda_flatten_block_direct_unchecked<named_dim::z,
2468  named_dim::y,
2469  named_dim::x>;
2470 
2471 template<named_dim... dims>
2472 using cuda_flatten_global_direct_unchecked =
2473  cuda_flatten_indexer_direct_unchecked<
2474  cuda::IndexGlobal<dims,
2477 using cuda_flatten_global_x_direct_unchecked =
2478  cuda_flatten_global_direct_unchecked<named_dim::x>;
2479 using cuda_flatten_global_y_direct_unchecked =
2480  cuda_flatten_global_direct_unchecked<named_dim::y>;
2481 using cuda_flatten_global_z_direct_unchecked =
2482  cuda_flatten_global_direct_unchecked<named_dim::z>;
2483 using cuda_flatten_global_xy_direct_unchecked =
2484  cuda_flatten_global_direct_unchecked<named_dim::x, named_dim::y>;
2485 using cuda_flatten_global_xz_direct_unchecked =
2486  cuda_flatten_global_direct_unchecked<named_dim::x, named_dim::z>;
2487 using cuda_flatten_global_yx_direct_unchecked =
2488  cuda_flatten_global_direct_unchecked<named_dim::y, named_dim::x>;
2489 using cuda_flatten_global_yz_direct_unchecked =
2490  cuda_flatten_global_direct_unchecked<named_dim::y, named_dim::z>;
2491 using cuda_flatten_global_zx_direct_unchecked =
2492  cuda_flatten_global_direct_unchecked<named_dim::z, named_dim::x>;
2493 using cuda_flatten_global_zy_direct_unchecked =
2494  cuda_flatten_global_direct_unchecked<named_dim::z, named_dim::y>;
2495 using cuda_flatten_global_xyz_direct_unchecked =
2496  cuda_flatten_global_direct_unchecked<named_dim::x,
2497  named_dim::y,
2498  named_dim::z>;
2499 using cuda_flatten_global_xzy_direct_unchecked =
2500  cuda_flatten_global_direct_unchecked<named_dim::x,
2501  named_dim::z,
2502  named_dim::y>;
2503 using cuda_flatten_global_yxz_direct_unchecked =
2504  cuda_flatten_global_direct_unchecked<named_dim::y,
2505  named_dim::x,
2506  named_dim::z>;
2507 using cuda_flatten_global_yzx_direct_unchecked =
2508  cuda_flatten_global_direct_unchecked<named_dim::y,
2509  named_dim::z,
2510  named_dim::x>;
2511 using cuda_flatten_global_zxy_direct_unchecked =
2512  cuda_flatten_global_direct_unchecked<named_dim::z,
2513  named_dim::x,
2514  named_dim::y>;
2515 using cuda_flatten_global_zyx_direct_unchecked =
2516  cuda_flatten_global_direct_unchecked<named_dim::z,
2517  named_dim::y,
2518  named_dim::x>;
2519 
2520 /*
2521  * Maps segment indices to flattened CUDA threads, blocks, or global threads.
2522  * This is a low overhead mapping, but requires that there are enough
2523  * physical threads, blocks, or global threads to fit all of the direct map
2524  * requests.
2525  * Reshapes multiple physical threads, blocks, or global threads into a 1D
2526  * iteration space
2527  */
2528 template<named_dim... dims>
2529 using cuda_flatten_thread_direct = cuda_flatten_indexer_direct<
2530  cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2531 using cuda_flatten_thread_x_direct = cuda_flatten_thread_direct<named_dim::x>;
2532 using cuda_flatten_thread_y_direct = cuda_flatten_thread_direct<named_dim::y>;
2533 using cuda_flatten_thread_z_direct = cuda_flatten_thread_direct<named_dim::z>;
2534 using cuda_flatten_thread_xy_direct =
2535  cuda_flatten_thread_direct<named_dim::x, named_dim::y>;
2536 using cuda_flatten_thread_xz_direct =
2537  cuda_flatten_thread_direct<named_dim::x, named_dim::z>;
2538 using cuda_flatten_thread_yx_direct =
2539  cuda_flatten_thread_direct<named_dim::y, named_dim::x>;
2540 using cuda_flatten_thread_yz_direct =
2541  cuda_flatten_thread_direct<named_dim::y, named_dim::z>;
2542 using cuda_flatten_thread_zx_direct =
2543  cuda_flatten_thread_direct<named_dim::z, named_dim::x>;
2544 using cuda_flatten_thread_zy_direct =
2545  cuda_flatten_thread_direct<named_dim::z, named_dim::y>;
2546 using cuda_flatten_thread_xyz_direct =
2547  cuda_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
2548 using cuda_flatten_thread_xzy_direct =
2549  cuda_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
2550 using cuda_flatten_thread_yxz_direct =
2551  cuda_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
2552 using cuda_flatten_thread_yzx_direct =
2553  cuda_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
2554 using cuda_flatten_thread_zxy_direct =
2555  cuda_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
2556 using cuda_flatten_thread_zyx_direct =
2557  cuda_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
2558 
2559 template<named_dim... dims>
2560 using cuda_flatten_block_direct = cuda_flatten_indexer_direct<
2561  cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2562 using cuda_flatten_block_x_direct = cuda_flatten_block_direct<named_dim::x>;
2563 using cuda_flatten_block_y_direct = cuda_flatten_block_direct<named_dim::y>;
2564 using cuda_flatten_block_z_direct = cuda_flatten_block_direct<named_dim::z>;
2565 using cuda_flatten_block_xy_direct =
2566  cuda_flatten_block_direct<named_dim::x, named_dim::y>;
2567 using cuda_flatten_block_xz_direct =
2568  cuda_flatten_block_direct<named_dim::x, named_dim::z>;
2569 using cuda_flatten_block_yx_direct =
2570  cuda_flatten_block_direct<named_dim::y, named_dim::x>;
2571 using cuda_flatten_block_yz_direct =
2572  cuda_flatten_block_direct<named_dim::y, named_dim::z>;
2573 using cuda_flatten_block_zx_direct =
2574  cuda_flatten_block_direct<named_dim::z, named_dim::x>;
2575 using cuda_flatten_block_zy_direct =
2576  cuda_flatten_block_direct<named_dim::z, named_dim::y>;
2577 using cuda_flatten_block_xyz_direct =
2578  cuda_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
2579 using cuda_flatten_block_xzy_direct =
2580  cuda_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
2581 using cuda_flatten_block_yxz_direct =
2582  cuda_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
2583 using cuda_flatten_block_yzx_direct =
2584  cuda_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
2585 using cuda_flatten_block_zxy_direct =
2586  cuda_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
2587 using cuda_flatten_block_zyx_direct =
2588  cuda_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
2589 
2590 template<named_dim... dims>
2591 using cuda_flatten_global_direct =
2592  cuda_flatten_indexer_direct<cuda::IndexGlobal<dims,
// Named shorthands for cuda_flatten_global_direct covering every ordering of
// one, two, or three of named_dim::x/y/z. The letters in each alias name give
// the order in which the dims are passed.

// One dimension.
2595 using cuda_flatten_global_x_direct = cuda_flatten_global_direct<named_dim::x>;
2596 using cuda_flatten_global_y_direct = cuda_flatten_global_direct<named_dim::y>;
2597 using cuda_flatten_global_z_direct = cuda_flatten_global_direct<named_dim::z>;
// Two dimensions.
2598 using cuda_flatten_global_xy_direct =
2599  cuda_flatten_global_direct<named_dim::x, named_dim::y>;
2600 using cuda_flatten_global_xz_direct =
2601  cuda_flatten_global_direct<named_dim::x, named_dim::z>;
2602 using cuda_flatten_global_yx_direct =
2603  cuda_flatten_global_direct<named_dim::y, named_dim::x>;
2604 using cuda_flatten_global_yz_direct =
2605  cuda_flatten_global_direct<named_dim::y, named_dim::z>;
2606 using cuda_flatten_global_zx_direct =
2607  cuda_flatten_global_direct<named_dim::z, named_dim::x>;
2608 using cuda_flatten_global_zy_direct =
2609  cuda_flatten_global_direct<named_dim::z, named_dim::y>;
// Three dimensions.
2610 using cuda_flatten_global_xyz_direct =
2611  cuda_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
2612 using cuda_flatten_global_xzy_direct =
2613  cuda_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
2614 using cuda_flatten_global_yxz_direct =
2615  cuda_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
2616 using cuda_flatten_global_yzx_direct =
2617  cuda_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
2618 using cuda_flatten_global_zxy_direct =
2619  cuda_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
2620 using cuda_flatten_global_zyx_direct =
2621  cuda_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
2622 
2623 /*
2624  * Maps segment indices to flattened CUDA threads, blocks, or global threads.
2625  * Reshapes multiple physical threads, blocks, or global threads into a 1D
2626  * iteration space.
2627  * Uses block-stride or grid-stride looping to exceed the maximum number of
2628  * physical threads, blocks, or global threads.
2629  */
// Flattened thread "loop" policy: expands every named_dim in `dims` into a
// cuda::IndexGlobal whose BLOCK_SIZE argument is named_usage::unspecified and
// whose GRID_SIZE argument is named_usage::ignored, and passes the pack to
// cuda_flatten_indexer_loop.
2630 template<named_dim... dims>
2631 using cuda_flatten_thread_loop = cuda_flatten_indexer_loop<
2632  cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
// Named shorthands for every ordering of one, two, or three of
// named_dim::x/y/z; the letters in the alias name give the argument order.
// One dimension.
2633 using cuda_flatten_thread_x_loop = cuda_flatten_thread_loop<named_dim::x>;
2634 using cuda_flatten_thread_y_loop = cuda_flatten_thread_loop<named_dim::y>;
2635 using cuda_flatten_thread_z_loop = cuda_flatten_thread_loop<named_dim::z>;
// Two dimensions.
2636 using cuda_flatten_thread_xy_loop =
2637  cuda_flatten_thread_loop<named_dim::x, named_dim::y>;
2638 using cuda_flatten_thread_xz_loop =
2639  cuda_flatten_thread_loop<named_dim::x, named_dim::z>;
2640 using cuda_flatten_thread_yx_loop =
2641  cuda_flatten_thread_loop<named_dim::y, named_dim::x>;
2642 using cuda_flatten_thread_yz_loop =
2643  cuda_flatten_thread_loop<named_dim::y, named_dim::z>;
2644 using cuda_flatten_thread_zx_loop =
2645  cuda_flatten_thread_loop<named_dim::z, named_dim::x>;
2646 using cuda_flatten_thread_zy_loop =
2647  cuda_flatten_thread_loop<named_dim::z, named_dim::y>;
// Three dimensions.
2648 using cuda_flatten_thread_xyz_loop =
2649  cuda_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
2650 using cuda_flatten_thread_xzy_loop =
2651  cuda_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
2652 using cuda_flatten_thread_yxz_loop =
2653  cuda_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
2654 using cuda_flatten_thread_yzx_loop =
2655  cuda_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
2656 using cuda_flatten_thread_zxy_loop =
2657  cuda_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
2658 using cuda_flatten_thread_zyx_loop =
2659  cuda_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
2660 
// Flattened block "loop" policy: expands every named_dim in `dims` into a
// cuda::IndexGlobal whose BLOCK_SIZE argument is named_usage::ignored and
// whose GRID_SIZE argument is named_usage::unspecified, and passes the pack
// to cuda_flatten_indexer_loop.
2661 template<named_dim... dims>
2662 using cuda_flatten_block_loop = cuda_flatten_indexer_loop<
2663  cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
// Named shorthands for every ordering of one, two, or three of
// named_dim::x/y/z; the letters in the alias name give the argument order.
// One dimension.
2664 using cuda_flatten_block_x_loop = cuda_flatten_block_loop<named_dim::x>;
2665 using cuda_flatten_block_y_loop = cuda_flatten_block_loop<named_dim::y>;
2666 using cuda_flatten_block_z_loop = cuda_flatten_block_loop<named_dim::z>;
// Two dimensions.
2667 using cuda_flatten_block_xy_loop =
2668  cuda_flatten_block_loop<named_dim::x, named_dim::y>;
2669 using cuda_flatten_block_xz_loop =
2670  cuda_flatten_block_loop<named_dim::x, named_dim::z>;
2671 using cuda_flatten_block_yx_loop =
2672  cuda_flatten_block_loop<named_dim::y, named_dim::x>;
2673 using cuda_flatten_block_yz_loop =
2674  cuda_flatten_block_loop<named_dim::y, named_dim::z>;
2675 using cuda_flatten_block_zx_loop =
2676  cuda_flatten_block_loop<named_dim::z, named_dim::x>;
2677 using cuda_flatten_block_zy_loop =
2678  cuda_flatten_block_loop<named_dim::z, named_dim::y>;
// Three dimensions.
2679 using cuda_flatten_block_xyz_loop =
2680  cuda_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
2681 using cuda_flatten_block_xzy_loop =
2682  cuda_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
2683 using cuda_flatten_block_yxz_loop =
2684  cuda_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
2685 using cuda_flatten_block_yzx_loop =
2686  cuda_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
2687 using cuda_flatten_block_zxy_loop =
2688  cuda_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
2689 using cuda_flatten_block_zyx_loop =
2690  cuda_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
2691 
// Flattened "global" loop policy: expands every named_dim in `dims` into a
// cuda::IndexGlobal and hands the resulting pack to
// cuda_flatten_indexer_loop. Both the BLOCK_SIZE and the GRID_SIZE argument
// of IndexGlobal are named_usage::unspecified here, in contrast to the thread
// aliases (grid ignored) and the block aliases (block ignored).
// NOTE(review): the two trailing template-argument lines were missing from
// this listing and have been restored from the sibling aliases' pattern --
// confirm against the upstream header.
2692 template<named_dim... dims>
2693 using cuda_flatten_global_loop =
2694  cuda_flatten_indexer_loop<cuda::IndexGlobal<dims,
2695  named_usage::unspecified,
2696  named_usage::unspecified>...>;
// Named shorthands for cuda_flatten_global_loop covering every ordering of
// one, two, or three of named_dim::x/y/z. The letters in each alias name give
// the order in which the dims are passed.

// One dimension.
2697 using cuda_flatten_global_x_loop = cuda_flatten_global_loop<named_dim::x>;
2698 using cuda_flatten_global_y_loop = cuda_flatten_global_loop<named_dim::y>;
2699 using cuda_flatten_global_z_loop = cuda_flatten_global_loop<named_dim::z>;
// Two dimensions.
2700 using cuda_flatten_global_xy_loop =
2701  cuda_flatten_global_loop<named_dim::x, named_dim::y>;
2702 using cuda_flatten_global_xz_loop =
2703  cuda_flatten_global_loop<named_dim::x, named_dim::z>;
2704 using cuda_flatten_global_yx_loop =
2705  cuda_flatten_global_loop<named_dim::y, named_dim::x>;
2706 using cuda_flatten_global_yz_loop =
2707  cuda_flatten_global_loop<named_dim::y, named_dim::z>;
2708 using cuda_flatten_global_zx_loop =
2709  cuda_flatten_global_loop<named_dim::z, named_dim::x>;
2710 using cuda_flatten_global_zy_loop =
2711  cuda_flatten_global_loop<named_dim::z, named_dim::y>;
// Three dimensions.
2712 using cuda_flatten_global_xyz_loop =
2713  cuda_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
2714 using cuda_flatten_global_xzy_loop =
2715  cuda_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
2716 using cuda_flatten_global_yxz_loop =
2717  cuda_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
2718 using cuda_flatten_global_yzx_loop =
2719  cuda_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
2720 using cuda_flatten_global_zxy_loop =
2721  cuda_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
2722 using cuda_flatten_global_zyx_loop =
2723  cuda_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
2724 
// Sized thread aliases over cuda_indexer_direct_unchecked. Each alias takes
// one int size per dimension and forwards it to the matching
// cuda::thread_x / thread_y / thread_z indexer. Template parameters and
// indexers both appear in the order spelled by the letters in the alias name.
// NOTE(review): "unchecked" presumably means no range check is applied --
// confirm against cuda_indexer_direct_unchecked.

// One dimension.
2730 template<int X_SIZE>
2731 using cuda_thread_size_x_direct_unchecked =
2732  cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>>;
2733 template<int Y_SIZE>
2734 using cuda_thread_size_y_direct_unchecked =
2735  cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>>;
2736 template<int Z_SIZE>
2737 using cuda_thread_size_z_direct_unchecked =
2738  cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>>;
// Two dimensions.
2739 template<int X_SIZE, int Y_SIZE>
2740 using cuda_thread_size_xy_direct_unchecked =
2741  cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2742  cuda::thread_y<Y_SIZE>>;
2743 template<int X_SIZE, int Z_SIZE>
2744 using cuda_thread_size_xz_direct_unchecked =
2745  cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2746  cuda::thread_z<Z_SIZE>>;
2747 template<int Y_SIZE, int X_SIZE>
2748 using cuda_thread_size_yx_direct_unchecked =
2749  cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2750  cuda::thread_x<X_SIZE>>;
2751 template<int Y_SIZE, int Z_SIZE>
2752 using cuda_thread_size_yz_direct_unchecked =
2753  cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2754  cuda::thread_z<Z_SIZE>>;
2755 template<int Z_SIZE, int X_SIZE>
2756 using cuda_thread_size_zx_direct_unchecked =
2757  cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2758  cuda::thread_x<X_SIZE>>;
2759 template<int Z_SIZE, int Y_SIZE>
2760 using cuda_thread_size_zy_direct_unchecked =
2761  cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2762  cuda::thread_y<Y_SIZE>>;
// Three dimensions.
2763 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
2764 using cuda_thread_size_xyz_direct_unchecked =
2765  cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2766  cuda::thread_y<Y_SIZE>,
2767  cuda::thread_z<Z_SIZE>>;
2768 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
2769 using cuda_thread_size_xzy_direct_unchecked =
2770  cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2771  cuda::thread_z<Z_SIZE>,
2772  cuda::thread_y<Y_SIZE>>;
2773 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
2774 using cuda_thread_size_yxz_direct_unchecked =
2775  cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2776  cuda::thread_x<X_SIZE>,
2777  cuda::thread_z<Z_SIZE>>;
2778 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
2779 using cuda_thread_size_yzx_direct_unchecked =
2780  cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2781  cuda::thread_z<Z_SIZE>,
2782  cuda::thread_x<X_SIZE>>;
2783 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
2784 using cuda_thread_size_zxy_direct_unchecked =
2785  cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2786  cuda::thread_x<X_SIZE>,
2787  cuda::thread_y<Y_SIZE>>;
2788 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
2789 using cuda_thread_size_zyx_direct_unchecked =
2790  cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2791  cuda::thread_y<Y_SIZE>,
2792  cuda::thread_x<X_SIZE>>;
2793 
// Sized block aliases over cuda_indexer_direct_unchecked. Each alias takes
// one int size per dimension and forwards it to the matching
// cuda::block_x / block_y / block_z indexer. Template parameters and indexers
// both appear in the order spelled by the letters in the alias name.

// One dimension.
2794 template<int X_SIZE>
2795 using cuda_block_size_x_direct_unchecked =
2796  cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>>;
2797 template<int Y_SIZE>
2798 using cuda_block_size_y_direct_unchecked =
2799  cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>>;
2800 template<int Z_SIZE>
2801 using cuda_block_size_z_direct_unchecked =
2802  cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>>;
// Two dimensions.
2803 template<int X_SIZE, int Y_SIZE>
2804 using cuda_block_size_xy_direct_unchecked =
2805  cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
2806 template<int X_SIZE, int Z_SIZE>
2807 using cuda_block_size_xz_direct_unchecked =
2808  cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
2809 template<int Y_SIZE, int X_SIZE>
2810 using cuda_block_size_yx_direct_unchecked =
2811  cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
2812 template<int Y_SIZE, int Z_SIZE>
2813 using cuda_block_size_yz_direct_unchecked =
2814  cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
2815 template<int Z_SIZE, int X_SIZE>
2816 using cuda_block_size_zx_direct_unchecked =
2817  cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
2818 template<int Z_SIZE, int Y_SIZE>
2819 using cuda_block_size_zy_direct_unchecked =
2820  cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// Three dimensions.
2821 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
2822 using cuda_block_size_xyz_direct_unchecked =
2823  cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
2824  cuda::block_y<Y_SIZE>,
2825  cuda::block_z<Z_SIZE>>;
2826 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
2827 using cuda_block_size_xzy_direct_unchecked =
2828  cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
2829  cuda::block_z<Z_SIZE>,
2830  cuda::block_y<Y_SIZE>>;
2831 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
2832 using cuda_block_size_yxz_direct_unchecked =
2833  cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
2834  cuda::block_x<X_SIZE>,
2835  cuda::block_z<Z_SIZE>>;
2836 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
2837 using cuda_block_size_yzx_direct_unchecked =
2838  cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
2839  cuda::block_z<Z_SIZE>,
2840  cuda::block_x<X_SIZE>>;
2841 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
2842 using cuda_block_size_zxy_direct_unchecked =
2843  cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
2844  cuda::block_x<X_SIZE>,
2845  cuda::block_y<Y_SIZE>>;
2846 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
2847 using cuda_block_size_zyx_direct_unchecked =
2848  cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
2849  cuda::block_y<Y_SIZE>,
2850  cuda::block_x<X_SIZE>>;
2851 
// Sized global aliases over cuda_indexer_direct_unchecked. Each dimension
// contributes a cuda::global_x / global_y / global_z indexer built from a
// block size and a grid size. Template parameter layout: all block sizes
// first, in the order spelled by the letters in the alias name, then all grid
// sizes in that same order. Every grid size defaults to
// named_usage::unspecified, so callers may supply block sizes only.

// One dimension.
2852 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
2853 using cuda_global_size_x_direct_unchecked =
2854  cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2855 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
2856 using cuda_global_size_y_direct_unchecked =
2857  cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2858 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
2859 using cuda_global_size_z_direct_unchecked =
2860  cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two dimensions.
2861 template<int X_BLOCK_SIZE,
2862  int Y_BLOCK_SIZE,
2863  int X_GRID_SIZE = named_usage::unspecified,
2864  int Y_GRID_SIZE = named_usage::unspecified>
2865 using cuda_global_size_xy_direct_unchecked =
2866  cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2867  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2868 template<int X_BLOCK_SIZE,
2869  int Z_BLOCK_SIZE,
2870  int X_GRID_SIZE = named_usage::unspecified,
2871  int Z_GRID_SIZE = named_usage::unspecified>
2872 using cuda_global_size_xz_direct_unchecked =
2873  cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2874  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2875 template<int Y_BLOCK_SIZE,
2876  int X_BLOCK_SIZE,
2877  int Y_GRID_SIZE = named_usage::unspecified,
2878  int X_GRID_SIZE = named_usage::unspecified>
2879 using cuda_global_size_yx_direct_unchecked =
2880  cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2881  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2882 template<int Y_BLOCK_SIZE,
2883  int Z_BLOCK_SIZE,
2884  int Y_GRID_SIZE = named_usage::unspecified,
2885  int Z_GRID_SIZE = named_usage::unspecified>
2886 using cuda_global_size_yz_direct_unchecked =
2887  cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2888  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2889 template<int Z_BLOCK_SIZE,
2890  int X_BLOCK_SIZE,
2891  int Z_GRID_SIZE = named_usage::unspecified,
2892  int X_GRID_SIZE = named_usage::unspecified>
2893 using cuda_global_size_zx_direct_unchecked =
2894  cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2895  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2896 template<int Z_BLOCK_SIZE,
2897  int Y_BLOCK_SIZE,
2898  int Z_GRID_SIZE = named_usage::unspecified,
2899  int Y_GRID_SIZE = named_usage::unspecified>
2900 using cuda_global_size_zy_direct_unchecked =
2901  cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2902  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Three dimensions.
2903 template<int X_BLOCK_SIZE,
2904  int Y_BLOCK_SIZE,
2905  int Z_BLOCK_SIZE,
2906  int X_GRID_SIZE = named_usage::unspecified,
2907  int Y_GRID_SIZE = named_usage::unspecified,
2908  int Z_GRID_SIZE = named_usage::unspecified>
2909 using cuda_global_size_xyz_direct_unchecked =
2910  cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2911  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2912  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2913 template<int X_BLOCK_SIZE,
2914  int Z_BLOCK_SIZE,
2915  int Y_BLOCK_SIZE,
2916  int X_GRID_SIZE = named_usage::unspecified,
2917  int Z_GRID_SIZE = named_usage::unspecified,
2918  int Y_GRID_SIZE = named_usage::unspecified>
2919 using cuda_global_size_xzy_direct_unchecked =
2920  cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2921  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2922  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2923 template<int Y_BLOCK_SIZE,
2924  int X_BLOCK_SIZE,
2925  int Z_BLOCK_SIZE,
2926  int Y_GRID_SIZE = named_usage::unspecified,
2927  int X_GRID_SIZE = named_usage::unspecified,
2928  int Z_GRID_SIZE = named_usage::unspecified>
2929 using cuda_global_size_yxz_direct_unchecked =
2930  cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2931  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2932  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2933 template<int Y_BLOCK_SIZE,
2934  int Z_BLOCK_SIZE,
2935  int X_BLOCK_SIZE,
2936  int Y_GRID_SIZE = named_usage::unspecified,
2937  int Z_GRID_SIZE = named_usage::unspecified,
2938  int X_GRID_SIZE = named_usage::unspecified>
2939 using cuda_global_size_yzx_direct_unchecked =
2940  cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2941  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2942  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2943 template<int Z_BLOCK_SIZE,
2944  int X_BLOCK_SIZE,
2945  int Y_BLOCK_SIZE,
2946  int Z_GRID_SIZE = named_usage::unspecified,
2947  int X_GRID_SIZE = named_usage::unspecified,
2948  int Y_GRID_SIZE = named_usage::unspecified>
2949 using cuda_global_size_zxy_direct_unchecked =
2950  cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2951  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2952  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2953 template<int Z_BLOCK_SIZE,
2954  int Y_BLOCK_SIZE,
2955  int X_BLOCK_SIZE,
2956  int Z_GRID_SIZE = named_usage::unspecified,
2957  int Y_GRID_SIZE = named_usage::unspecified,
2958  int X_GRID_SIZE = named_usage::unspecified>
2959 using cuda_global_size_zyx_direct_unchecked =
2960  cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2961  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2962  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2963 
// Sized thread aliases over cuda_indexer_direct. Each alias takes one int
// size per dimension and forwards it to the matching
// cuda::thread_x / thread_y / thread_z indexer. Template parameters and
// indexers both appear in the order spelled by the letters in the alias name.

// One dimension.
2969 template<int X_SIZE>
2970 using cuda_thread_size_x_direct = cuda_indexer_direct<cuda::thread_x<X_SIZE>>;
2971 template<int Y_SIZE>
2972 using cuda_thread_size_y_direct = cuda_indexer_direct<cuda::thread_y<Y_SIZE>>;
2973 template<int Z_SIZE>
2974 using cuda_thread_size_z_direct = cuda_indexer_direct<cuda::thread_z<Z_SIZE>>;
// Two dimensions.
2975 template<int X_SIZE, int Y_SIZE>
2976 using cuda_thread_size_xy_direct =
2977  cuda_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
2978 template<int X_SIZE, int Z_SIZE>
2979 using cuda_thread_size_xz_direct =
2980  cuda_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
2981 template<int Y_SIZE, int X_SIZE>
2982 using cuda_thread_size_yx_direct =
2983  cuda_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
2984 template<int Y_SIZE, int Z_SIZE>
2985 using cuda_thread_size_yz_direct =
2986  cuda_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
2987 template<int Z_SIZE, int X_SIZE>
2988 using cuda_thread_size_zx_direct =
2989  cuda_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
2990 template<int Z_SIZE, int Y_SIZE>
2991 using cuda_thread_size_zy_direct =
2992  cuda_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
// Three dimensions.
2993 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
2994 using cuda_thread_size_xyz_direct = cuda_indexer_direct<cuda::thread_x<X_SIZE>,
2995  cuda::thread_y<Y_SIZE>,
2996  cuda::thread_z<Z_SIZE>>;
2997 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
2998 using cuda_thread_size_xzy_direct = cuda_indexer_direct<cuda::thread_x<X_SIZE>,
2999  cuda::thread_z<Z_SIZE>,
3000  cuda::thread_y<Y_SIZE>>;
3001 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3002 using cuda_thread_size_yxz_direct = cuda_indexer_direct<cuda::thread_y<Y_SIZE>,
3003  cuda::thread_x<X_SIZE>,
3004  cuda::thread_z<Z_SIZE>>;
3005 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3006 using cuda_thread_size_yzx_direct = cuda_indexer_direct<cuda::thread_y<Y_SIZE>,
3007  cuda::thread_z<Z_SIZE>,
3008  cuda::thread_x<X_SIZE>>;
3009 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3010 using cuda_thread_size_zxy_direct = cuda_indexer_direct<cuda::thread_z<Z_SIZE>,
3011  cuda::thread_x<X_SIZE>,
3012  cuda::thread_y<Y_SIZE>>;
3013 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3014 using cuda_thread_size_zyx_direct = cuda_indexer_direct<cuda::thread_z<Z_SIZE>,
3015  cuda::thread_y<Y_SIZE>,
3016  cuda::thread_x<X_SIZE>>;
3017 
// Sized block aliases over cuda_indexer_direct. Each alias takes one int size
// per dimension and forwards it to the matching
// cuda::block_x / block_y / block_z indexer. Template parameters and indexers
// both appear in the order spelled by the letters in the alias name.

// One dimension.
3018 template<int X_SIZE>
3019 using cuda_block_size_x_direct = cuda_indexer_direct<cuda::block_x<X_SIZE>>;
3020 template<int Y_SIZE>
3021 using cuda_block_size_y_direct = cuda_indexer_direct<cuda::block_y<Y_SIZE>>;
3022 template<int Z_SIZE>
3023 using cuda_block_size_z_direct = cuda_indexer_direct<cuda::block_z<Z_SIZE>>;
// Two dimensions.
3024 template<int X_SIZE, int Y_SIZE>
3025 using cuda_block_size_xy_direct =
3026  cuda_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3027 template<int X_SIZE, int Z_SIZE>
3028 using cuda_block_size_xz_direct =
3029  cuda_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3030 template<int Y_SIZE, int X_SIZE>
3031 using cuda_block_size_yx_direct =
3032  cuda_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3033 template<int Y_SIZE, int Z_SIZE>
3034 using cuda_block_size_yz_direct =
3035  cuda_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3036 template<int Z_SIZE, int X_SIZE>
3037 using cuda_block_size_zx_direct =
3038  cuda_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3039 template<int Z_SIZE, int Y_SIZE>
3040 using cuda_block_size_zy_direct =
3041  cuda_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// Three dimensions.
3042 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3043 using cuda_block_size_xyz_direct = cuda_indexer_direct<cuda::block_x<X_SIZE>,
3044  cuda::block_y<Y_SIZE>,
3045  cuda::block_z<Z_SIZE>>;
3046 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3047 using cuda_block_size_xzy_direct = cuda_indexer_direct<cuda::block_x<X_SIZE>,
3048  cuda::block_z<Z_SIZE>,
3049  cuda::block_y<Y_SIZE>>;
3050 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3051 using cuda_block_size_yxz_direct = cuda_indexer_direct<cuda::block_y<Y_SIZE>,
3052  cuda::block_x<X_SIZE>,
3053  cuda::block_z<Z_SIZE>>;
3054 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3055 using cuda_block_size_yzx_direct = cuda_indexer_direct<cuda::block_y<Y_SIZE>,
3056  cuda::block_z<Z_SIZE>,
3057  cuda::block_x<X_SIZE>>;
3058 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3059 using cuda_block_size_zxy_direct = cuda_indexer_direct<cuda::block_z<Z_SIZE>,
3060  cuda::block_x<X_SIZE>,
3061  cuda::block_y<Y_SIZE>>;
3062 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3063 using cuda_block_size_zyx_direct = cuda_indexer_direct<cuda::block_z<Z_SIZE>,
3064  cuda::block_y<Y_SIZE>,
3065  cuda::block_x<X_SIZE>>;
3066 
// Sized global aliases over cuda_indexer_direct. Each dimension contributes a
// cuda::global_x / global_y / global_z indexer built from a block size and a
// grid size. Template parameter layout: all block sizes first, in the order
// spelled by the letters in the alias name, then all grid sizes in that same
// order. Every grid size defaults to named_usage::unspecified, so callers may
// supply block sizes only.

// One dimension.
3067 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3068 using cuda_global_size_x_direct =
3069  cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3070 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3071 using cuda_global_size_y_direct =
3072  cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3073 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3074 using cuda_global_size_z_direct =
3075  cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two dimensions.
3076 template<int X_BLOCK_SIZE,
3077  int Y_BLOCK_SIZE,
3078  int X_GRID_SIZE = named_usage::unspecified,
3079  int Y_GRID_SIZE = named_usage::unspecified>
3080 using cuda_global_size_xy_direct =
3081  cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3082  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3083 template<int X_BLOCK_SIZE,
3084  int Z_BLOCK_SIZE,
3085  int X_GRID_SIZE = named_usage::unspecified,
3086  int Z_GRID_SIZE = named_usage::unspecified>
3087 using cuda_global_size_xz_direct =
3088  cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3089  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3090 template<int Y_BLOCK_SIZE,
3091  int X_BLOCK_SIZE,
3092  int Y_GRID_SIZE = named_usage::unspecified,
3093  int X_GRID_SIZE = named_usage::unspecified>
3094 using cuda_global_size_yx_direct =
3095  cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3096  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3097 template<int Y_BLOCK_SIZE,
3098  int Z_BLOCK_SIZE,
3099  int Y_GRID_SIZE = named_usage::unspecified,
3100  int Z_GRID_SIZE = named_usage::unspecified>
3101 using cuda_global_size_yz_direct =
3102  cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3103  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3104 template<int Z_BLOCK_SIZE,
3105  int X_BLOCK_SIZE,
3106  int Z_GRID_SIZE = named_usage::unspecified,
3107  int X_GRID_SIZE = named_usage::unspecified>
3108 using cuda_global_size_zx_direct =
3109  cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3110  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3111 template<int Z_BLOCK_SIZE,
3112  int Y_BLOCK_SIZE,
3113  int Z_GRID_SIZE = named_usage::unspecified,
3114  int Y_GRID_SIZE = named_usage::unspecified>
3115 using cuda_global_size_zy_direct =
3116  cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3117  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Three dimensions.
3118 template<int X_BLOCK_SIZE,
3119  int Y_BLOCK_SIZE,
3120  int Z_BLOCK_SIZE,
3121  int X_GRID_SIZE = named_usage::unspecified,
3122  int Y_GRID_SIZE = named_usage::unspecified,
3123  int Z_GRID_SIZE = named_usage::unspecified>
3124 using cuda_global_size_xyz_direct =
3125  cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3126  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3127  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3128 template<int X_BLOCK_SIZE,
3129  int Z_BLOCK_SIZE,
3130  int Y_BLOCK_SIZE,
3131  int X_GRID_SIZE = named_usage::unspecified,
3132  int Z_GRID_SIZE = named_usage::unspecified,
3133  int Y_GRID_SIZE = named_usage::unspecified>
3134 using cuda_global_size_xzy_direct =
3135  cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3136  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3137  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3138 template<int Y_BLOCK_SIZE,
3139  int X_BLOCK_SIZE,
3140  int Z_BLOCK_SIZE,
3141  int Y_GRID_SIZE = named_usage::unspecified,
3142  int X_GRID_SIZE = named_usage::unspecified,
3143  int Z_GRID_SIZE = named_usage::unspecified>
3144 using cuda_global_size_yxz_direct =
3145  cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3146  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3147  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3148 template<int Y_BLOCK_SIZE,
3149  int Z_BLOCK_SIZE,
3150  int X_BLOCK_SIZE,
3151  int Y_GRID_SIZE = named_usage::unspecified,
3152  int Z_GRID_SIZE = named_usage::unspecified,
3153  int X_GRID_SIZE = named_usage::unspecified>
3154 using cuda_global_size_yzx_direct =
3155  cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3156  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3157  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3158 template<int Z_BLOCK_SIZE,
3159  int X_BLOCK_SIZE,
3160  int Y_BLOCK_SIZE,
3161  int Z_GRID_SIZE = named_usage::unspecified,
3162  int X_GRID_SIZE = named_usage::unspecified,
3163  int Y_GRID_SIZE = named_usage::unspecified>
3164 using cuda_global_size_zxy_direct =
3165  cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3166  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3167  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3168 template<int Z_BLOCK_SIZE,
3169  int Y_BLOCK_SIZE,
3170  int X_BLOCK_SIZE,
3171  int Z_GRID_SIZE = named_usage::unspecified,
3172  int Y_GRID_SIZE = named_usage::unspecified,
3173  int X_GRID_SIZE = named_usage::unspecified>
3174 using cuda_global_size_zyx_direct =
3175  cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3176  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3177  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3178 
// Sized thread aliases over cuda_indexer_loop. Each alias takes one int size
// per dimension and forwards it to the matching
// cuda::thread_x / thread_y / thread_z indexer. Template parameters and
// indexers both appear in the order spelled by the letters in the alias name.

// One dimension.
3184 template<int X_SIZE>
3185 using cuda_thread_size_x_loop = cuda_indexer_loop<cuda::thread_x<X_SIZE>>;
3186 template<int Y_SIZE>
3187 using cuda_thread_size_y_loop = cuda_indexer_loop<cuda::thread_y<Y_SIZE>>;
3188 template<int Z_SIZE>
3189 using cuda_thread_size_z_loop = cuda_indexer_loop<cuda::thread_z<Z_SIZE>>;
// Two dimensions.
3190 template<int X_SIZE, int Y_SIZE>
3191 using cuda_thread_size_xy_loop =
3192  cuda_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
3193 template<int X_SIZE, int Z_SIZE>
3194 using cuda_thread_size_xz_loop =
3195  cuda_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
3196 template<int Y_SIZE, int X_SIZE>
3197 using cuda_thread_size_yx_loop =
3198  cuda_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
3199 template<int Y_SIZE, int Z_SIZE>
3200 using cuda_thread_size_yz_loop =
3201  cuda_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
3202 template<int Z_SIZE, int X_SIZE>
3203 using cuda_thread_size_zx_loop =
3204  cuda_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
3205 template<int Z_SIZE, int Y_SIZE>
3206 using cuda_thread_size_zy_loop =
3207  cuda_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
// Three dimensions.
3208 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3209 using cuda_thread_size_xyz_loop = cuda_indexer_loop<cuda::thread_x<X_SIZE>,
3210  cuda::thread_y<Y_SIZE>,
3211  cuda::thread_z<Z_SIZE>>;
3212 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3213 using cuda_thread_size_xzy_loop = cuda_indexer_loop<cuda::thread_x<X_SIZE>,
3214  cuda::thread_z<Z_SIZE>,
3215  cuda::thread_y<Y_SIZE>>;
3216 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3217 using cuda_thread_size_yxz_loop = cuda_indexer_loop<cuda::thread_y<Y_SIZE>,
3218  cuda::thread_x<X_SIZE>,
3219  cuda::thread_z<Z_SIZE>>;
3220 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3221 using cuda_thread_size_yzx_loop = cuda_indexer_loop<cuda::thread_y<Y_SIZE>,
3222  cuda::thread_z<Z_SIZE>,
3223  cuda::thread_x<X_SIZE>>;
3224 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3225 using cuda_thread_size_zxy_loop = cuda_indexer_loop<cuda::thread_z<Z_SIZE>,
3226  cuda::thread_x<X_SIZE>,
3227  cuda::thread_y<Y_SIZE>>;
3228 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3229 using cuda_thread_size_zyx_loop = cuda_indexer_loop<cuda::thread_z<Z_SIZE>,
3230  cuda::thread_y<Y_SIZE>,
3231  cuda::thread_x<X_SIZE>>;
3232 
// Sized block aliases over cuda_indexer_loop. Each alias takes one int size
// per dimension and forwards it to the matching
// cuda::block_x / block_y / block_z indexer. Template parameters and indexers
// both appear in the order spelled by the letters in the alias name.

// One dimension.
3233 template<int X_SIZE>
3234 using cuda_block_size_x_loop = cuda_indexer_loop<cuda::block_x<X_SIZE>>;
3235 template<int Y_SIZE>
3236 using cuda_block_size_y_loop = cuda_indexer_loop<cuda::block_y<Y_SIZE>>;
3237 template<int Z_SIZE>
3238 using cuda_block_size_z_loop = cuda_indexer_loop<cuda::block_z<Z_SIZE>>;
// Two dimensions.
3239 template<int X_SIZE, int Y_SIZE>
3240 using cuda_block_size_xy_loop =
3241  cuda_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3242 template<int X_SIZE, int Z_SIZE>
3243 using cuda_block_size_xz_loop =
3244  cuda_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3245 template<int Y_SIZE, int X_SIZE>
3246 using cuda_block_size_yx_loop =
3247  cuda_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3248 template<int Y_SIZE, int Z_SIZE>
3249 using cuda_block_size_yz_loop =
3250  cuda_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3251 template<int Z_SIZE, int X_SIZE>
3252 using cuda_block_size_zx_loop =
3253  cuda_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3254 template<int Z_SIZE, int Y_SIZE>
3255 using cuda_block_size_zy_loop =
3256  cuda_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// Three dimensions.
3257 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3258 using cuda_block_size_xyz_loop = cuda_indexer_loop<cuda::block_x<X_SIZE>,
3259  cuda::block_y<Y_SIZE>,
3260  cuda::block_z<Z_SIZE>>;
3261 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3262 using cuda_block_size_xzy_loop = cuda_indexer_loop<cuda::block_x<X_SIZE>,
3263  cuda::block_z<Z_SIZE>,
3264  cuda::block_y<Y_SIZE>>;
3265 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3266 using cuda_block_size_yxz_loop = cuda_indexer_loop<cuda::block_y<Y_SIZE>,
3267  cuda::block_x<X_SIZE>,
3268  cuda::block_z<Z_SIZE>>;
3269 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3270 using cuda_block_size_yzx_loop = cuda_indexer_loop<cuda::block_y<Y_SIZE>,
3271  cuda::block_z<Z_SIZE>,
3272  cuda::block_x<X_SIZE>>;
3273 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3274 using cuda_block_size_zxy_loop = cuda_indexer_loop<cuda::block_z<Z_SIZE>,
3275  cuda::block_x<X_SIZE>,
3276  cuda::block_y<Y_SIZE>>;
3277 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3278 using cuda_block_size_zyx_loop = cuda_indexer_loop<cuda::block_z<Z_SIZE>,
3279  cuda::block_y<Y_SIZE>,
3280  cuda::block_x<X_SIZE>>;
3281 
3282 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3283 using cuda_global_size_x_loop =
3284  cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3285 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3286 using cuda_global_size_y_loop =
3287  cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3288 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3289 using cuda_global_size_z_loop =
3290  cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two-axis cuda_global_size_*_loop aliases.
// Each expands to cuda_indexer_loop over two cuda::global_{x,y,z} indexers in
// the order spelled by the alias name. Template parameters list both block
// sizes first (in name order), then both grid sizes (in name order); the grid
// sizes default to named_usage::unspecified.
3291 template<int X_BLOCK_SIZE,
3292  int Y_BLOCK_SIZE,
3293  int X_GRID_SIZE = named_usage::unspecified,
3294  int Y_GRID_SIZE = named_usage::unspecified>
3295 using cuda_global_size_xy_loop =
3296  cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3297  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3298 template<int X_BLOCK_SIZE,
3299  int Z_BLOCK_SIZE,
3300  int X_GRID_SIZE = named_usage::unspecified,
3301  int Z_GRID_SIZE = named_usage::unspecified>
3302 using cuda_global_size_xz_loop =
3303  cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3304  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3305 template<int Y_BLOCK_SIZE,
3306  int X_BLOCK_SIZE,
3307  int Y_GRID_SIZE = named_usage::unspecified,
3308  int X_GRID_SIZE = named_usage::unspecified>
3309 using cuda_global_size_yx_loop =
3310  cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3311  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3312 template<int Y_BLOCK_SIZE,
3313  int Z_BLOCK_SIZE,
3314  int Y_GRID_SIZE = named_usage::unspecified,
3315  int Z_GRID_SIZE = named_usage::unspecified>
3316 using cuda_global_size_yz_loop =
3317  cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3318  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3319 template<int Z_BLOCK_SIZE,
3320  int X_BLOCK_SIZE,
3321  int Z_GRID_SIZE = named_usage::unspecified,
3322  int X_GRID_SIZE = named_usage::unspecified>
3323 using cuda_global_size_zx_loop =
3324  cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3325  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3326 template<int Z_BLOCK_SIZE,
3327  int Y_BLOCK_SIZE,
3328  int Z_GRID_SIZE = named_usage::unspecified,
3329  int Y_GRID_SIZE = named_usage::unspecified>
3330 using cuda_global_size_zy_loop =
3331  cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3332  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Three-axis cuda_global_size_*_loop aliases, one per permutation of x, y, z.
// Each expands to cuda_indexer_loop over three cuda::global_{x,y,z} indexers
// in the order spelled by the alias name. Template parameters list the three
// block sizes first (in name order), then the three grid sizes (in name
// order); the grid sizes default to named_usage::unspecified.
3333 template<int X_BLOCK_SIZE,
3334  int Y_BLOCK_SIZE,
3335  int Z_BLOCK_SIZE,
3336  int X_GRID_SIZE = named_usage::unspecified,
3337  int Y_GRID_SIZE = named_usage::unspecified,
3338  int Z_GRID_SIZE = named_usage::unspecified>
3339 using cuda_global_size_xyz_loop =
3340  cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3341  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3342  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3343 template<int X_BLOCK_SIZE,
3344  int Z_BLOCK_SIZE,
3345  int Y_BLOCK_SIZE,
3346  int X_GRID_SIZE = named_usage::unspecified,
3347  int Z_GRID_SIZE = named_usage::unspecified,
3348  int Y_GRID_SIZE = named_usage::unspecified>
3349 using cuda_global_size_xzy_loop =
3350  cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3351  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3352  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3353 template<int Y_BLOCK_SIZE,
3354  int X_BLOCK_SIZE,
3355  int Z_BLOCK_SIZE,
3356  int Y_GRID_SIZE = named_usage::unspecified,
3357  int X_GRID_SIZE = named_usage::unspecified,
3358  int Z_GRID_SIZE = named_usage::unspecified>
3359 using cuda_global_size_yxz_loop =
3360  cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3361  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3362  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3363 template<int Y_BLOCK_SIZE,
3364  int Z_BLOCK_SIZE,
3365  int X_BLOCK_SIZE,
3366  int Y_GRID_SIZE = named_usage::unspecified,
3367  int Z_GRID_SIZE = named_usage::unspecified,
3368  int X_GRID_SIZE = named_usage::unspecified>
3369 using cuda_global_size_yzx_loop =
3370  cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3371  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3372  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3373 template<int Z_BLOCK_SIZE,
3374  int X_BLOCK_SIZE,
3375  int Y_BLOCK_SIZE,
3376  int Z_GRID_SIZE = named_usage::unspecified,
3377  int X_GRID_SIZE = named_usage::unspecified,
3378  int Y_GRID_SIZE = named_usage::unspecified>
3379 using cuda_global_size_zxy_loop =
3380  cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3381  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3382  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3383 template<int Z_BLOCK_SIZE,
3384  int Y_BLOCK_SIZE,
3385  int X_BLOCK_SIZE,
3386  int Z_GRID_SIZE = named_usage::unspecified,
3387  int Y_GRID_SIZE = named_usage::unspecified,
3388  int X_GRID_SIZE = named_usage::unspecified>
3389 using cuda_global_size_zyx_loop =
3390  cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3391  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3392  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3393 
3394 /*
3395  * Maps segment indices to flattened CUDA threads, blocks, or global threads.
3396  * This is the lowest overhead mapping, but requires that there are the same
3397  * number of physical threads, blocks, or global threads as the map requests.
3398  * Reshapes multiple physical threads, blocks, or global threads into a 1D
3399  * iteration space.
3400  */
// Single-axis cuda_flatten_thread_size_*_direct_unchecked aliases.
// Each expands to cuda_flatten_indexer_direct_unchecked over one
// cuda::thread_{x,y,z} indexer parameterized by the given SIZE.
3401 template<int X_SIZE>
3402 using cuda_flatten_thread_size_x_direct_unchecked =
3403  cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>>;
3404 template<int Y_SIZE>
3405 using cuda_flatten_thread_size_y_direct_unchecked =
3406  cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>>;
3407 template<int Z_SIZE>
3408 using cuda_flatten_thread_size_z_direct_unchecked =
3409  cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>>;
// Two-axis cuda_flatten_thread_size_*_direct_unchecked aliases.
// Each expands to cuda_flatten_indexer_direct_unchecked over two
// cuda::thread_{x,y,z} indexers, passed in the order spelled by the alias
// name; the SIZE template parameters are declared in that same order.
3410 template<int X_SIZE, int Y_SIZE>
3411 using cuda_flatten_thread_size_xy_direct_unchecked =
3412  cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3413  cuda::thread_y<Y_SIZE>>;
3414 template<int X_SIZE, int Z_SIZE>
3415 using cuda_flatten_thread_size_xz_direct_unchecked =
3416  cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3417  cuda::thread_z<Z_SIZE>>;
3418 template<int Y_SIZE, int X_SIZE>
3419 using cuda_flatten_thread_size_yx_direct_unchecked =
3420  cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3421  cuda::thread_x<X_SIZE>>;
3422 template<int Y_SIZE, int Z_SIZE>
3423 using cuda_flatten_thread_size_yz_direct_unchecked =
3424  cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3425  cuda::thread_z<Z_SIZE>>;
3426 template<int Z_SIZE, int X_SIZE>
3427 using cuda_flatten_thread_size_zx_direct_unchecked =
3428  cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3429  cuda::thread_x<X_SIZE>>;
3430 template<int Z_SIZE, int Y_SIZE>
3431 using cuda_flatten_thread_size_zy_direct_unchecked =
3432  cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3433  cuda::thread_y<Y_SIZE>>;
// Three-axis cuda_flatten_thread_size_*_direct_unchecked aliases, one per
// permutation of x, y, z. Each expands to
// cuda_flatten_indexer_direct_unchecked over three cuda::thread_{x,y,z}
// indexers, passed in the order spelled by the alias name; the SIZE template
// parameters are declared in that same order.
3434 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3435 using cuda_flatten_thread_size_xyz_direct_unchecked =
3436  cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3437  cuda::thread_y<Y_SIZE>,
3438  cuda::thread_z<Z_SIZE>>;
3439 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3440 using cuda_flatten_thread_size_xzy_direct_unchecked =
3441  cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3442  cuda::thread_z<Z_SIZE>,
3443  cuda::thread_y<Y_SIZE>>;
3444 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3445 using cuda_flatten_thread_size_yxz_direct_unchecked =
3446  cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3447  cuda::thread_x<X_SIZE>,
3448  cuda::thread_z<Z_SIZE>>;
3449 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3450 using cuda_flatten_thread_size_yzx_direct_unchecked =
3451  cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3452  cuda::thread_z<Z_SIZE>,
3453  cuda::thread_x<X_SIZE>>;
3454 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3455 using cuda_flatten_thread_size_zxy_direct_unchecked =
3456  cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3457  cuda::thread_x<X_SIZE>,
3458  cuda::thread_y<Y_SIZE>>;
3459 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3460 using cuda_flatten_thread_size_zyx_direct_unchecked =
3461  cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3462  cuda::thread_y<Y_SIZE>,
3463  cuda::thread_x<X_SIZE>>;
3464 
// Single-axis cuda_flatten_block_size_*_direct_unchecked aliases.
// Each expands to cuda_flatten_indexer_direct_unchecked over one
// cuda::block_{x,y,z} indexer parameterized by the given SIZE.
3465 template<int X_SIZE>
3466 using cuda_flatten_block_size_x_direct_unchecked =
3467  cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>>;
3468 template<int Y_SIZE>
3469 using cuda_flatten_block_size_y_direct_unchecked =
3470  cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>>;
3471 template<int Z_SIZE>
3472 using cuda_flatten_block_size_z_direct_unchecked =
3473  cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>>;
// Two-axis cuda_flatten_block_size_*_direct_unchecked aliases.
// Each expands to cuda_flatten_indexer_direct_unchecked over two
// cuda::block_{x,y,z} indexers, passed in the order spelled by the alias
// name; the SIZE template parameters are declared in that same order.
3474 template<int X_SIZE, int Y_SIZE>
3475 using cuda_flatten_block_size_xy_direct_unchecked =
3476  cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3477  cuda::block_y<Y_SIZE>>;
3478 template<int X_SIZE, int Z_SIZE>
3479 using cuda_flatten_block_size_xz_direct_unchecked =
3480  cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3481  cuda::block_z<Z_SIZE>>;
3482 template<int Y_SIZE, int X_SIZE>
3483 using cuda_flatten_block_size_yx_direct_unchecked =
3484  cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3485  cuda::block_x<X_SIZE>>;
3486 template<int Y_SIZE, int Z_SIZE>
3487 using cuda_flatten_block_size_yz_direct_unchecked =
3488  cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3489  cuda::block_z<Z_SIZE>>;
3490 template<int Z_SIZE, int X_SIZE>
3491 using cuda_flatten_block_size_zx_direct_unchecked =
3492  cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3493  cuda::block_x<X_SIZE>>;
3494 template<int Z_SIZE, int Y_SIZE>
3495 using cuda_flatten_block_size_zy_direct_unchecked =
3496  cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3497  cuda::block_y<Y_SIZE>>;
// Three-axis cuda_flatten_block_size_*_direct_unchecked aliases, one per
// permutation of x, y, z. Each expands to
// cuda_flatten_indexer_direct_unchecked over three cuda::block_{x,y,z}
// indexers, passed in the order spelled by the alias name; the SIZE template
// parameters are declared in that same order.
3498 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3499 using cuda_flatten_block_size_xyz_direct_unchecked =
3500  cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3501  cuda::block_y<Y_SIZE>,
3502  cuda::block_z<Z_SIZE>>;
3503 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3504 using cuda_flatten_block_size_xzy_direct_unchecked =
3505  cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3506  cuda::block_z<Z_SIZE>,
3507  cuda::block_y<Y_SIZE>>;
3508 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3509 using cuda_flatten_block_size_yxz_direct_unchecked =
3510  cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3511  cuda::block_x<X_SIZE>,
3512  cuda::block_z<Z_SIZE>>;
3513 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3514 using cuda_flatten_block_size_yzx_direct_unchecked =
3515  cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3516  cuda::block_z<Z_SIZE>,
3517  cuda::block_x<X_SIZE>>;
3518 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3519 using cuda_flatten_block_size_zxy_direct_unchecked =
3520  cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3521  cuda::block_x<X_SIZE>,
3522  cuda::block_y<Y_SIZE>>;
3523 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3524 using cuda_flatten_block_size_zyx_direct_unchecked =
3525  cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3526  cuda::block_y<Y_SIZE>,
3527  cuda::block_x<X_SIZE>>;
3528 
// Single-axis cuda_flatten_global_size_*_direct_unchecked aliases.
// Each expands to cuda_flatten_indexer_direct_unchecked over one
// cuda::global_{x,y,z} indexer. The first template argument is the block size
// for that axis; the second is the grid size, which defaults to
// named_usage::unspecified when omitted.
3529 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3530 using cuda_flatten_global_size_x_direct_unchecked =
3531  cuda_flatten_indexer_direct_unchecked<
3532  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3533 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3534 using cuda_flatten_global_size_y_direct_unchecked =
3535  cuda_flatten_indexer_direct_unchecked<
3536  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3537 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3538 using cuda_flatten_global_size_z_direct_unchecked =
3539  cuda_flatten_indexer_direct_unchecked<
3540  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two-axis cuda_flatten_global_size_*_direct_unchecked aliases.
// Each expands to cuda_flatten_indexer_direct_unchecked over two
// cuda::global_{x,y,z} indexers in the order spelled by the alias name.
// Template parameters list both block sizes first (in name order), then both
// grid sizes (in name order); the grid sizes default to
// named_usage::unspecified.
3541 template<int X_BLOCK_SIZE,
3542  int Y_BLOCK_SIZE,
3543  int X_GRID_SIZE = named_usage::unspecified,
3544  int Y_GRID_SIZE = named_usage::unspecified>
3545 using cuda_flatten_global_size_xy_direct_unchecked =
3546  cuda_flatten_indexer_direct_unchecked<
3547  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3548  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3549 template<int X_BLOCK_SIZE,
3550  int Z_BLOCK_SIZE,
3551  int X_GRID_SIZE = named_usage::unspecified,
3552  int Z_GRID_SIZE = named_usage::unspecified>
3553 using cuda_flatten_global_size_xz_direct_unchecked =
3554  cuda_flatten_indexer_direct_unchecked<
3555  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3556  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3557 template<int Y_BLOCK_SIZE,
3558  int X_BLOCK_SIZE,
3559  int Y_GRID_SIZE = named_usage::unspecified,
3560  int X_GRID_SIZE = named_usage::unspecified>
3561 using cuda_flatten_global_size_yx_direct_unchecked =
3562  cuda_flatten_indexer_direct_unchecked<
3563  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3564  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3565 template<int Y_BLOCK_SIZE,
3566  int Z_BLOCK_SIZE,
3567  int Y_GRID_SIZE = named_usage::unspecified,
3568  int Z_GRID_SIZE = named_usage::unspecified>
3569 using cuda_flatten_global_size_yz_direct_unchecked =
3570  cuda_flatten_indexer_direct_unchecked<
3571  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3572  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3573 template<int Z_BLOCK_SIZE,
3574  int X_BLOCK_SIZE,
3575  int Z_GRID_SIZE = named_usage::unspecified,
3576  int X_GRID_SIZE = named_usage::unspecified>
3577 using cuda_flatten_global_size_zx_direct_unchecked =
3578  cuda_flatten_indexer_direct_unchecked<
3579  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3580  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3581 template<int Z_BLOCK_SIZE,
3582  int Y_BLOCK_SIZE,
3583  int Z_GRID_SIZE = named_usage::unspecified,
3584  int Y_GRID_SIZE = named_usage::unspecified>
3585 using cuda_flatten_global_size_zy_direct_unchecked =
3586  cuda_flatten_indexer_direct_unchecked<
3587  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3588  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Three-axis cuda_flatten_global_size_*_direct_unchecked aliases, one per
// permutation of x, y, z. Each expands to
// cuda_flatten_indexer_direct_unchecked over three cuda::global_{x,y,z}
// indexers in the order spelled by the alias name. Template parameters list
// the three block sizes first (in name order), then the three grid sizes (in
// name order); the grid sizes default to named_usage::unspecified.
3589 template<int X_BLOCK_SIZE,
3590  int Y_BLOCK_SIZE,
3591  int Z_BLOCK_SIZE,
3592  int X_GRID_SIZE = named_usage::unspecified,
3593  int Y_GRID_SIZE = named_usage::unspecified,
3594  int Z_GRID_SIZE = named_usage::unspecified>
3595 using cuda_flatten_global_size_xyz_direct_unchecked =
3596  cuda_flatten_indexer_direct_unchecked<
3597  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3598  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3599  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3600 template<int X_BLOCK_SIZE,
3601  int Z_BLOCK_SIZE,
3602  int Y_BLOCK_SIZE,
3603  int X_GRID_SIZE = named_usage::unspecified,
3604  int Z_GRID_SIZE = named_usage::unspecified,
3605  int Y_GRID_SIZE = named_usage::unspecified>
3606 using cuda_flatten_global_size_xzy_direct_unchecked =
3607  cuda_flatten_indexer_direct_unchecked<
3608  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3609  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3610  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3611 template<int Y_BLOCK_SIZE,
3612  int X_BLOCK_SIZE,
3613  int Z_BLOCK_SIZE,
3614  int Y_GRID_SIZE = named_usage::unspecified,
3615  int X_GRID_SIZE = named_usage::unspecified,
3616  int Z_GRID_SIZE = named_usage::unspecified>
3617 using cuda_flatten_global_size_yxz_direct_unchecked =
3618  cuda_flatten_indexer_direct_unchecked<
3619  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3620  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3621  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3622 template<int Y_BLOCK_SIZE,
3623  int Z_BLOCK_SIZE,
3624  int X_BLOCK_SIZE,
3625  int Y_GRID_SIZE = named_usage::unspecified,
3626  int Z_GRID_SIZE = named_usage::unspecified,
3627  int X_GRID_SIZE = named_usage::unspecified>
3628 using cuda_flatten_global_size_yzx_direct_unchecked =
3629  cuda_flatten_indexer_direct_unchecked<
3630  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3631  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3632  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3633 template<int Z_BLOCK_SIZE,
3634  int X_BLOCK_SIZE,
3635  int Y_BLOCK_SIZE,
3636  int Z_GRID_SIZE = named_usage::unspecified,
3637  int X_GRID_SIZE = named_usage::unspecified,
3638  int Y_GRID_SIZE = named_usage::unspecified>
3639 using cuda_flatten_global_size_zxy_direct_unchecked =
3640  cuda_flatten_indexer_direct_unchecked<
3641  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3642  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3643  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3644 template<int Z_BLOCK_SIZE,
3645  int Y_BLOCK_SIZE,
3646  int X_BLOCK_SIZE,
3647  int Z_GRID_SIZE = named_usage::unspecified,
3648  int Y_GRID_SIZE = named_usage::unspecified,
3649  int X_GRID_SIZE = named_usage::unspecified>
3650 using cuda_flatten_global_size_zyx_direct_unchecked =
3651  cuda_flatten_indexer_direct_unchecked<
3652  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3653  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3654  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3655 
3656 /*
3657  * Maps segment indices to flattened CUDA threads, blocks, or global threads.
3658  * This is a low overhead mapping, but requires that there are enough
3659  * physical threads, blocks, or global threads to fit all of the direct map
3660  * requests.
3661  * Reshapes multiple physical threads, blocks, or global threads into a 1D
3662  * iteration space.
3663  */
// Single-axis cuda_flatten_thread_size_*_direct aliases.
// Each expands to cuda_flatten_indexer_direct over one cuda::thread_{x,y,z}
// indexer parameterized by the given SIZE.
3664 template<int X_SIZE>
3665 using cuda_flatten_thread_size_x_direct =
3666  cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>>;
3667 template<int Y_SIZE>
3668 using cuda_flatten_thread_size_y_direct =
3669  cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>>;
3670 template<int Z_SIZE>
3671 using cuda_flatten_thread_size_z_direct =
3672  cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>>;
// Two-axis cuda_flatten_thread_size_*_direct aliases.
// Each expands to cuda_flatten_indexer_direct over two cuda::thread_{x,y,z}
// indexers, passed in the order spelled by the alias name; the SIZE template
// parameters are declared in that same order.
3673 template<int X_SIZE, int Y_SIZE>
3674 using cuda_flatten_thread_size_xy_direct =
3675  cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
3676 template<int X_SIZE, int Z_SIZE>
3677 using cuda_flatten_thread_size_xz_direct =
3678  cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
3679 template<int Y_SIZE, int X_SIZE>
3680 using cuda_flatten_thread_size_yx_direct =
3681  cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
3682 template<int Y_SIZE, int Z_SIZE>
3683 using cuda_flatten_thread_size_yz_direct =
3684  cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
3685 template<int Z_SIZE, int X_SIZE>
3686 using cuda_flatten_thread_size_zx_direct =
3687  cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
3688 template<int Z_SIZE, int Y_SIZE>
3689 using cuda_flatten_thread_size_zy_direct =
3690  cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
// Three-axis cuda_flatten_thread_size_*_direct aliases, one per permutation
// of x, y, z. Each expands to cuda_flatten_indexer_direct over three
// cuda::thread_{x,y,z} indexers, passed in the order spelled by the alias
// name; the SIZE template parameters are declared in that same order.
3691 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3692 using cuda_flatten_thread_size_xyz_direct =
3693  cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>,
3694  cuda::thread_y<Y_SIZE>,
3695  cuda::thread_z<Z_SIZE>>;
3696 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3697 using cuda_flatten_thread_size_xzy_direct =
3698  cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>,
3699  cuda::thread_z<Z_SIZE>,
3700  cuda::thread_y<Y_SIZE>>;
3701 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3702 using cuda_flatten_thread_size_yxz_direct =
3703  cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>,
3704  cuda::thread_x<X_SIZE>,
3705  cuda::thread_z<Z_SIZE>>;
3706 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3707 using cuda_flatten_thread_size_yzx_direct =
3708  cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>,
3709  cuda::thread_z<Z_SIZE>,
3710  cuda::thread_x<X_SIZE>>;
3711 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3712 using cuda_flatten_thread_size_zxy_direct =
3713  cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>,
3714  cuda::thread_x<X_SIZE>,
3715  cuda::thread_y<Y_SIZE>>;
3716 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3717 using cuda_flatten_thread_size_zyx_direct =
3718  cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>,
3719  cuda::thread_y<Y_SIZE>,
3720  cuda::thread_x<X_SIZE>>;
3721 
// Single-axis cuda_flatten_block_size_*_direct aliases.
// Each expands to cuda_flatten_indexer_direct over one cuda::block_{x,y,z}
// indexer parameterized by the given SIZE.
3722 template<int X_SIZE>
3723 using cuda_flatten_block_size_x_direct =
3724  cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>>;
3725 template<int Y_SIZE>
3726 using cuda_flatten_block_size_y_direct =
3727  cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>>;
3728 template<int Z_SIZE>
3729 using cuda_flatten_block_size_z_direct =
3730  cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>>;
// Two-axis cuda_flatten_block_size_*_direct aliases.
// Each expands to cuda_flatten_indexer_direct over two cuda::block_{x,y,z}
// indexers, passed in the order spelled by the alias name; the SIZE template
// parameters are declared in that same order.
3731 template<int X_SIZE, int Y_SIZE>
3732 using cuda_flatten_block_size_xy_direct =
3733  cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3734 template<int X_SIZE, int Z_SIZE>
3735 using cuda_flatten_block_size_xz_direct =
3736  cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3737 template<int Y_SIZE, int X_SIZE>
3738 using cuda_flatten_block_size_yx_direct =
3739  cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3740 template<int Y_SIZE, int Z_SIZE>
3741 using cuda_flatten_block_size_yz_direct =
3742  cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3743 template<int Z_SIZE, int X_SIZE>
3744 using cuda_flatten_block_size_zx_direct =
3745  cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3746 template<int Z_SIZE, int Y_SIZE>
3747 using cuda_flatten_block_size_zy_direct =
3748  cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// Three-axis cuda_flatten_block_size_*_direct aliases, one per permutation of
// x, y, z. Each expands to cuda_flatten_indexer_direct over three
// cuda::block_{x,y,z} indexers, passed in the order spelled by the alias
// name; the SIZE template parameters are declared in that same order.
3749 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3750 using cuda_flatten_block_size_xyz_direct =
3751  cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>,
3752  cuda::block_y<Y_SIZE>,
3753  cuda::block_z<Z_SIZE>>;
3754 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3755 using cuda_flatten_block_size_xzy_direct =
3756  cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>,
3757  cuda::block_z<Z_SIZE>,
3758  cuda::block_y<Y_SIZE>>;
3759 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3760 using cuda_flatten_block_size_yxz_direct =
3761  cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>,
3762  cuda::block_x<X_SIZE>,
3763  cuda::block_z<Z_SIZE>>;
3764 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3765 using cuda_flatten_block_size_yzx_direct =
3766  cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>,
3767  cuda::block_z<Z_SIZE>,
3768  cuda::block_x<X_SIZE>>;
3769 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3770 using cuda_flatten_block_size_zxy_direct =
3771  cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>,
3772  cuda::block_x<X_SIZE>,
3773  cuda::block_y<Y_SIZE>>;
3774 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3775 using cuda_flatten_block_size_zyx_direct =
3776  cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>,
3777  cuda::block_y<Y_SIZE>,
3778  cuda::block_x<X_SIZE>>;
3779 
// Single-axis cuda_flatten_global_size_*_direct aliases.
// Each expands to cuda_flatten_indexer_direct over one cuda::global_{x,y,z}
// indexer. The first template argument is the block size for that axis; the
// second is the grid size, which defaults to named_usage::unspecified when
// omitted.
3780 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3781 using cuda_flatten_global_size_x_direct =
3782  cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3783 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3784 using cuda_flatten_global_size_y_direct =
3785  cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3786 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3787 using cuda_flatten_global_size_z_direct =
3788  cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two-axis cuda_flatten_global_size_*_direct aliases.
// Each expands to cuda_flatten_indexer_direct over two cuda::global_{x,y,z}
// indexers in the order spelled by the alias name. Template parameters list
// both block sizes first (in name order), then both grid sizes (in name
// order); the grid sizes default to named_usage::unspecified.
3789 template<int X_BLOCK_SIZE,
3790  int Y_BLOCK_SIZE,
3791  int X_GRID_SIZE = named_usage::unspecified,
3792  int Y_GRID_SIZE = named_usage::unspecified>
3793 using cuda_flatten_global_size_xy_direct =
3794  cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3795  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3796 template<int X_BLOCK_SIZE,
3797  int Z_BLOCK_SIZE,
3798  int X_GRID_SIZE = named_usage::unspecified,
3799  int Z_GRID_SIZE = named_usage::unspecified>
3800 using cuda_flatten_global_size_xz_direct =
3801  cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3802  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3803 template<int Y_BLOCK_SIZE,
3804  int X_BLOCK_SIZE,
3805  int Y_GRID_SIZE = named_usage::unspecified,
3806  int X_GRID_SIZE = named_usage::unspecified>
3807 using cuda_flatten_global_size_yx_direct =
3808  cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3809  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3810 template<int Y_BLOCK_SIZE,
3811  int Z_BLOCK_SIZE,
3812  int Y_GRID_SIZE = named_usage::unspecified,
3813  int Z_GRID_SIZE = named_usage::unspecified>
3814 using cuda_flatten_global_size_yz_direct =
3815  cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3816  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3817 template<int Z_BLOCK_SIZE,
3818  int X_BLOCK_SIZE,
3819  int Z_GRID_SIZE = named_usage::unspecified,
3820  int X_GRID_SIZE = named_usage::unspecified>
3821 using cuda_flatten_global_size_zx_direct =
3822  cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3823  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3824 template<int Z_BLOCK_SIZE,
3825  int Y_BLOCK_SIZE,
3826  int Z_GRID_SIZE = named_usage::unspecified,
3827  int Y_GRID_SIZE = named_usage::unspecified>
3828 using cuda_flatten_global_size_zy_direct =
3829  cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3830  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Three-axis cuda_flatten_global_size_*_direct aliases, one per permutation
// of x, y, z. Each expands to cuda_flatten_indexer_direct over three
// cuda::global_{x,y,z} indexers in the order spelled by the alias name.
// Template parameters list the three block sizes first (in name order), then
// the three grid sizes (in name order); the grid sizes default to
// named_usage::unspecified.
3831 template<int X_BLOCK_SIZE,
3832  int Y_BLOCK_SIZE,
3833  int Z_BLOCK_SIZE,
3834  int X_GRID_SIZE = named_usage::unspecified,
3835  int Y_GRID_SIZE = named_usage::unspecified,
3836  int Z_GRID_SIZE = named_usage::unspecified>
3837 using cuda_flatten_global_size_xyz_direct =
3838  cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3839  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3840  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3841 template<int X_BLOCK_SIZE,
3842  int Z_BLOCK_SIZE,
3843  int Y_BLOCK_SIZE,
3844  int X_GRID_SIZE = named_usage::unspecified,
3845  int Z_GRID_SIZE = named_usage::unspecified,
3846  int Y_GRID_SIZE = named_usage::unspecified>
3847 using cuda_flatten_global_size_xzy_direct =
3848  cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3849  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3850  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3851 template<int Y_BLOCK_SIZE,
3852  int X_BLOCK_SIZE,
3853  int Z_BLOCK_SIZE,
3854  int Y_GRID_SIZE = named_usage::unspecified,
3855  int X_GRID_SIZE = named_usage::unspecified,
3856  int Z_GRID_SIZE = named_usage::unspecified>
3857 using cuda_flatten_global_size_yxz_direct =
3858  cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3859  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3860  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3861 template<int Y_BLOCK_SIZE,
3862  int Z_BLOCK_SIZE,
3863  int X_BLOCK_SIZE,
3864  int Y_GRID_SIZE = named_usage::unspecified,
3865  int Z_GRID_SIZE = named_usage::unspecified,
3866  int X_GRID_SIZE = named_usage::unspecified>
3867 using cuda_flatten_global_size_yzx_direct =
3868  cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3869  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3870  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3871 template<int Z_BLOCK_SIZE,
3872  int X_BLOCK_SIZE,
3873  int Y_BLOCK_SIZE,
3874  int Z_GRID_SIZE = named_usage::unspecified,
3875  int X_GRID_SIZE = named_usage::unspecified,
3876  int Y_GRID_SIZE = named_usage::unspecified>
3877 using cuda_flatten_global_size_zxy_direct =
3878  cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3879  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3880  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3881 template<int Z_BLOCK_SIZE,
3882  int Y_BLOCK_SIZE,
3883  int X_BLOCK_SIZE,
3884  int Z_GRID_SIZE = named_usage::unspecified,
3885  int Y_GRID_SIZE = named_usage::unspecified,
3886  int X_GRID_SIZE = named_usage::unspecified>
3887 using cuda_flatten_global_size_zyx_direct =
3888  cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3889  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3890  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3891 
3892 /*
3893  * Maps segment indices to flattened CUDA threads, blocks, or global threads.
3894  * Reshapes multiple physical threads, blocks, or global threads into a 1D
3895  * iteration space.
3896  * Uses block-stride or grid-stride looping to exceed the maximum number of
3897  * physical threads, blocks, or global threads.
3898  */
// Single-axis cuda_flatten_thread_size_*_loop aliases.
// Each expands to cuda_flatten_indexer_loop over one cuda::thread_{x,y,z}
// indexer parameterized by the given SIZE.
3899 template<int X_SIZE>
3900 using cuda_flatten_thread_size_x_loop =
3901  cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>>;
3902 template<int Y_SIZE>
3903 using cuda_flatten_thread_size_y_loop =
3904  cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>>;
3905 template<int Z_SIZE>
3906 using cuda_flatten_thread_size_z_loop =
3907  cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>>;
// Two-axis cuda_flatten_thread_size_*_loop aliases.
// Each expands to cuda_flatten_indexer_loop over two cuda::thread_{x,y,z}
// indexers, passed in the order spelled by the alias name; the SIZE template
// parameters are declared in that same order.
3908 template<int X_SIZE, int Y_SIZE>
3909 using cuda_flatten_thread_size_xy_loop =
3910  cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
3911 template<int X_SIZE, int Z_SIZE>
3912 using cuda_flatten_thread_size_xz_loop =
3913  cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
3914 template<int Y_SIZE, int X_SIZE>
3915 using cuda_flatten_thread_size_yx_loop =
3916  cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
3917 template<int Y_SIZE, int Z_SIZE>
3918 using cuda_flatten_thread_size_yz_loop =
3919  cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
3920 template<int Z_SIZE, int X_SIZE>
3921 using cuda_flatten_thread_size_zx_loop =
3922  cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
3923 template<int Z_SIZE, int Y_SIZE>
3924 using cuda_flatten_thread_size_zy_loop =
3925  cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
// Three-axis cuda_flatten_thread_size_*_loop aliases, one per permutation of
// x, y, z. Each expands to cuda_flatten_indexer_loop over three
// cuda::thread_{x,y,z} indexers, passed in the order spelled by the alias
// name; the SIZE template parameters are declared in that same order.
3926 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3927 using cuda_flatten_thread_size_xyz_loop =
3928  cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>,
3929  cuda::thread_y<Y_SIZE>,
3930  cuda::thread_z<Z_SIZE>>;
3931 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3932 using cuda_flatten_thread_size_xzy_loop =
3933  cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>,
3934  cuda::thread_z<Z_SIZE>,
3935  cuda::thread_y<Y_SIZE>>;
3936 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3937 using cuda_flatten_thread_size_yxz_loop =
3938  cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>,
3939  cuda::thread_x<X_SIZE>,
3940  cuda::thread_z<Z_SIZE>>;
3941 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3942 using cuda_flatten_thread_size_yzx_loop =
3943  cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>,
3944  cuda::thread_z<Z_SIZE>,
3945  cuda::thread_x<X_SIZE>>;
3946 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3947 using cuda_flatten_thread_size_zxy_loop =
3948  cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>,
3949  cuda::thread_x<X_SIZE>,
3950  cuda::thread_y<Y_SIZE>>;
3951 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3952 using cuda_flatten_thread_size_zyx_loop =
3953  cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>,
3954  cuda::thread_y<Y_SIZE>,
3955  cuda::thread_x<X_SIZE>>;
3956 
// Single-axis cuda_flatten_block_size_*_loop aliases.
// Each expands to cuda_flatten_indexer_loop over one cuda::block_{x,y,z}
// indexer parameterized by the given SIZE.
3957 template<int X_SIZE>
3958 using cuda_flatten_block_size_x_loop =
3959  cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>>;
3960 template<int Y_SIZE>
3961 using cuda_flatten_block_size_y_loop =
3962  cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>>;
3963 template<int Z_SIZE>
3964 using cuda_flatten_block_size_z_loop =
3965  cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>>;
// Two-axis cuda_flatten_block_size_*_loop aliases.
// Each expands to cuda_flatten_indexer_loop over two cuda::block_{x,y,z}
// indexers, passed in the order spelled by the alias name; the SIZE template
// parameters are declared in that same order.
3966 template<int X_SIZE, int Y_SIZE>
3967 using cuda_flatten_block_size_xy_loop =
3968  cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3969 template<int X_SIZE, int Z_SIZE>
3970 using cuda_flatten_block_size_xz_loop =
3971  cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3972 template<int Y_SIZE, int X_SIZE>
3973 using cuda_flatten_block_size_yx_loop =
3974  cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3975 template<int Y_SIZE, int Z_SIZE>
3976 using cuda_flatten_block_size_yz_loop =
3977  cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3978 template<int Z_SIZE, int X_SIZE>
3979 using cuda_flatten_block_size_zx_loop =
3980  cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3981 template<int Z_SIZE, int Y_SIZE>
3982 using cuda_flatten_block_size_zy_loop =
3983  cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// Three-axis cuda_flatten_block_size_*_loop aliases, one per permutation of
// x, y, z. Each expands to cuda_flatten_indexer_loop over three
// cuda::block_{x,y,z} indexers, passed in the order spelled by the alias
// name; the SIZE template parameters are declared in that same order.
3984 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3985 using cuda_flatten_block_size_xyz_loop =
3986  cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>,
3987  cuda::block_y<Y_SIZE>,
3988  cuda::block_z<Z_SIZE>>;
3989 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3990 using cuda_flatten_block_size_xzy_loop =
3991  cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>,
3992  cuda::block_z<Z_SIZE>,
3993  cuda::block_y<Y_SIZE>>;
3994 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3995 using cuda_flatten_block_size_yxz_loop =
3996  cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>,
3997  cuda::block_x<X_SIZE>,
3998  cuda::block_z<Z_SIZE>>;
3999 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
4000 using cuda_flatten_block_size_yzx_loop =
4001  cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>,
4002  cuda::block_z<Z_SIZE>,
4003  cuda::block_x<X_SIZE>>;
4004 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
4005 using cuda_flatten_block_size_zxy_loop =
4006  cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>,
4007  cuda::block_x<X_SIZE>,
4008  cuda::block_y<Y_SIZE>>;
4009 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
4010 using cuda_flatten_block_size_zyx_loop =
4011  cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>,
4012  cuda::block_y<Y_SIZE>,
4013  cuda::block_x<X_SIZE>>;
4014 
// One-dimensional "global size" flatten loop aliases.
// Each wraps a single cuda::global_{x,y,z} indexer in
// cuda_flatten_indexer_loop. The block size is a required template argument;
// the grid size is optional and defaults to named_usage::unspecified. Both
// values are forwarded unchanged to the global indexer.
4015 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
4016 using cuda_flatten_global_size_x_loop =
4017  cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
4018 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
4019 using cuda_flatten_global_size_y_loop =
4020  cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
4021 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
4022 using cuda_flatten_global_size_z_loop =
4023  cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two-dimensional "global size" flatten loop aliases.
// Template parameter layout for every alias in this group:
//   1. the two block sizes, in the order spelled by the alias suffix;
//   2. the two grid sizes, in that same order, each defaulting to
//      named_usage::unspecified.
// So for _zx_ the argument list is <Z_BLOCK, X_BLOCK, Z_GRID, X_GRID>; block
// and grid sizes are NOT interleaved per dimension.
// Each (block, grid) pair is forwarded to the cuda::global_* indexer of its
// dimension, and the indexers are passed to cuda_flatten_indexer_loop in
// suffix order.
4024 template<int X_BLOCK_SIZE,
4025  int Y_BLOCK_SIZE,
4026  int X_GRID_SIZE = named_usage::unspecified,
4027  int Y_GRID_SIZE = named_usage::unspecified>
4028 using cuda_flatten_global_size_xy_loop =
4029  cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4030  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
4031 template<int X_BLOCK_SIZE,
4032  int Z_BLOCK_SIZE,
4033  int X_GRID_SIZE = named_usage::unspecified,
4034  int Z_GRID_SIZE = named_usage::unspecified>
4035 using cuda_flatten_global_size_xz_loop =
4036  cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4037  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
4038 template<int Y_BLOCK_SIZE,
4039  int X_BLOCK_SIZE,
4040  int Y_GRID_SIZE = named_usage::unspecified,
4041  int X_GRID_SIZE = named_usage::unspecified>
4042 using cuda_flatten_global_size_yx_loop =
4043  cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4044  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
4045 template<int Y_BLOCK_SIZE,
4046  int Z_BLOCK_SIZE,
4047  int Y_GRID_SIZE = named_usage::unspecified,
4048  int Z_GRID_SIZE = named_usage::unspecified>
4049 using cuda_flatten_global_size_yz_loop =
4050  cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4051  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
4052 template<int Z_BLOCK_SIZE,
4053  int X_BLOCK_SIZE,
4054  int Z_GRID_SIZE = named_usage::unspecified,
4055  int X_GRID_SIZE = named_usage::unspecified>
4056 using cuda_flatten_global_size_zx_loop =
4057  cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4058  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
4059 template<int Z_BLOCK_SIZE,
4060  int Y_BLOCK_SIZE,
4061  int Z_GRID_SIZE = named_usage::unspecified,
4062  int Y_GRID_SIZE = named_usage::unspecified>
4063 using cuda_flatten_global_size_zy_loop =
4064  cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4065  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Three-dimensional "global size" flatten loop aliases, one per permutation
// of x, y and z.
// Template parameter layout for every alias in this group:
//   1. the three block sizes, in the order spelled by the alias suffix;
//   2. the three grid sizes, in that same order, each defaulting to
//      named_usage::unspecified.
// So for _yzx_ the argument list is
// <Y_BLOCK, Z_BLOCK, X_BLOCK, Y_GRID, Z_GRID, X_GRID>; block and grid sizes
// are NOT interleaved per dimension.
// Each (block, grid) pair is forwarded to the cuda::global_* indexer of its
// dimension, and the indexers are passed to cuda_flatten_indexer_loop in
// suffix order.
4066 template<int X_BLOCK_SIZE,
4067  int Y_BLOCK_SIZE,
4068  int Z_BLOCK_SIZE,
4069  int X_GRID_SIZE = named_usage::unspecified,
4070  int Y_GRID_SIZE = named_usage::unspecified,
4071  int Z_GRID_SIZE = named_usage::unspecified>
4072 using cuda_flatten_global_size_xyz_loop =
4073  cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4074  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4075  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
4076 template<int X_BLOCK_SIZE,
4077  int Z_BLOCK_SIZE,
4078  int Y_BLOCK_SIZE,
4079  int X_GRID_SIZE = named_usage::unspecified,
4080  int Z_GRID_SIZE = named_usage::unspecified,
4081  int Y_GRID_SIZE = named_usage::unspecified>
4082 using cuda_flatten_global_size_xzy_loop =
4083  cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4084  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4085  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
4086 template<int Y_BLOCK_SIZE,
4087  int X_BLOCK_SIZE,
4088  int Z_BLOCK_SIZE,
4089  int Y_GRID_SIZE = named_usage::unspecified,
4090  int X_GRID_SIZE = named_usage::unspecified,
4091  int Z_GRID_SIZE = named_usage::unspecified>
4092 using cuda_flatten_global_size_yxz_loop =
4093  cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4094  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4095  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
4096 template<int Y_BLOCK_SIZE,
4097  int Z_BLOCK_SIZE,
4098  int X_BLOCK_SIZE,
4099  int Y_GRID_SIZE = named_usage::unspecified,
4100  int Z_GRID_SIZE = named_usage::unspecified,
4101  int X_GRID_SIZE = named_usage::unspecified>
4102 using cuda_flatten_global_size_yzx_loop =
4103  cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4104  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4105  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
4106 template<int Z_BLOCK_SIZE,
4107  int X_BLOCK_SIZE,
4108  int Y_BLOCK_SIZE,
4109  int Z_GRID_SIZE = named_usage::unspecified,
4110  int X_GRID_SIZE = named_usage::unspecified,
4111  int Y_GRID_SIZE = named_usage::unspecified>
4112 using cuda_flatten_global_size_zxy_loop =
4113  cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4114  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4115  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
4116 template<int Z_BLOCK_SIZE,
4117  int Y_BLOCK_SIZE,
4118  int X_BLOCK_SIZE,
4119  int Z_GRID_SIZE = named_usage::unspecified,
4120  int Y_GRID_SIZE = named_usage::unspecified,
4121  int X_GRID_SIZE = named_usage::unspecified>
4122 using cuda_flatten_global_size_zyx_loop =
4123  cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4124  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4125  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
4126 
4127 
4128 /*
4129  * Deprecated policies
4130  */
// Each name on the left is an old spelling kept as a plain alias of the policy
// on the right; new code should use the right-hand name directly.
// 1D: cuda_global_thread_{x,y,z} -> cuda_global_{x,y,z}_direct.
4131 using cuda_global_thread_x = cuda_global_x_direct;
4132 using cuda_global_thread_y = cuda_global_y_direct;
4133 using cuda_global_thread_z = cuda_global_z_direct;
4134 
// Deprecated 2D names: cuda_global_thread_<ab> is an alias of
// cuda_global_<ab>_direct, with the same dimension order <ab>.
4135 using cuda_global_thread_xy = cuda_global_xy_direct;
4136 using cuda_global_thread_xz = cuda_global_xz_direct;
4137 using cuda_global_thread_yx = cuda_global_yx_direct;
4138 using cuda_global_thread_yz = cuda_global_yz_direct;
4139 using cuda_global_thread_zx = cuda_global_zx_direct;
4140 using cuda_global_thread_zy = cuda_global_zy_direct;
4141 
// Deprecated 3D names: cuda_global_thread_<abc> is an alias of
// cuda_global_<abc>_direct, with the same dimension order <abc>.
4142 using cuda_global_thread_xyz = cuda_global_xyz_direct;
4143 using cuda_global_thread_xzy = cuda_global_xzy_direct;
4144 using cuda_global_thread_yxz = cuda_global_yxz_direct;
4145 using cuda_global_thread_yzx = cuda_global_yzx_direct;
4146 using cuda_global_thread_zxy = cuda_global_zxy_direct;
4147 using cuda_global_thread_zyx = cuda_global_zyx_direct;
4148 
// Deprecated 2D names: cuda_flatten_block_threads_<ab>_direct is an alias of
// cuda_flatten_thread_<ab>_direct ("block_threads" was shortened to "thread");
// the dimension order <ab> is unchanged.
4149 using cuda_flatten_block_threads_xy_direct = cuda_flatten_thread_xy_direct;
4150 using cuda_flatten_block_threads_xz_direct = cuda_flatten_thread_xz_direct;
4151 using cuda_flatten_block_threads_yx_direct = cuda_flatten_thread_yx_direct;
4152 using cuda_flatten_block_threads_yz_direct = cuda_flatten_thread_yz_direct;
4153 using cuda_flatten_block_threads_zx_direct = cuda_flatten_thread_zx_direct;
4154 using cuda_flatten_block_threads_zy_direct = cuda_flatten_thread_zy_direct;
4155 
// Deprecated 3D names: cuda_flatten_block_threads_<abc>_direct is an alias of
// cuda_flatten_thread_<abc>_direct; the dimension order <abc> is unchanged.
4156 using cuda_flatten_block_threads_xyz_direct = cuda_flatten_thread_xyz_direct;
4157 using cuda_flatten_block_threads_xzy_direct = cuda_flatten_thread_xzy_direct;
4158 using cuda_flatten_block_threads_yxz_direct = cuda_flatten_thread_yxz_direct;
4159 using cuda_flatten_block_threads_yzx_direct = cuda_flatten_thread_yzx_direct;
4160 using cuda_flatten_block_threads_zxy_direct = cuda_flatten_thread_zxy_direct;
4161 using cuda_flatten_block_threads_zyx_direct = cuda_flatten_thread_zyx_direct;
4162 
// Deprecated 2D names: cuda_flatten_block_threads_<ab>_loop is an alias of
// cuda_flatten_thread_<ab>_loop; the dimension order <ab> is unchanged.
4163 using cuda_flatten_block_threads_xy_loop = cuda_flatten_thread_xy_loop;
4164 using cuda_flatten_block_threads_xz_loop = cuda_flatten_thread_xz_loop;
4165 using cuda_flatten_block_threads_yx_loop = cuda_flatten_thread_yx_loop;
4166 using cuda_flatten_block_threads_yz_loop = cuda_flatten_thread_yz_loop;
4167 using cuda_flatten_block_threads_zx_loop = cuda_flatten_thread_zx_loop;
4168 using cuda_flatten_block_threads_zy_loop = cuda_flatten_thread_zy_loop;
4169 
// Deprecated 3D names: cuda_flatten_block_threads_<abc>_loop is an alias of
// cuda_flatten_thread_<abc>_loop; the dimension order <abc> is unchanged.
4170 using cuda_flatten_block_threads_xyz_loop = cuda_flatten_thread_xyz_loop;
4171 using cuda_flatten_block_threads_xzy_loop = cuda_flatten_thread_xzy_loop;
4172 using cuda_flatten_block_threads_yxz_loop = cuda_flatten_thread_yxz_loop;
4173 using cuda_flatten_block_threads_yzx_loop = cuda_flatten_thread_yzx_loop;
4174 using cuda_flatten_block_threads_zxy_loop = cuda_flatten_thread_zxy_loop;
4175 using cuda_flatten_block_threads_zyx_loop = cuda_flatten_thread_zyx_loop;
4176 
// Deprecated 2D names: cuda_block_<ab>_nested_direct is an alias of
// cuda_block_<ab>_direct (the "_nested" infix was dropped); the dimension
// order <ab> is unchanged.
4177 using cuda_block_xy_nested_direct = cuda_block_xy_direct;
4178 using cuda_block_xz_nested_direct = cuda_block_xz_direct;
4179 using cuda_block_yx_nested_direct = cuda_block_yx_direct;
4180 using cuda_block_yz_nested_direct = cuda_block_yz_direct;
4181 using cuda_block_zx_nested_direct = cuda_block_zx_direct;
4182 using cuda_block_zy_nested_direct = cuda_block_zy_direct;
4183 
// Deprecated 3D names: cuda_block_<abc>_nested_direct is an alias of
// cuda_block_<abc>_direct; the dimension order <abc> is unchanged.
4184 using cuda_block_xyz_nested_direct = cuda_block_xyz_direct;
4185 using cuda_block_xzy_nested_direct = cuda_block_xzy_direct;
4186 using cuda_block_yxz_nested_direct = cuda_block_yxz_direct;
4187 using cuda_block_yzx_nested_direct = cuda_block_yzx_direct;
4188 using cuda_block_zxy_nested_direct = cuda_block_zxy_direct;
4189 using cuda_block_zyx_nested_direct = cuda_block_zyx_direct;
4190 
// Deprecated 2D names: cuda_block_<ab>_nested_loop is an alias of
// cuda_block_<ab>_loop (the "_nested" infix was dropped); the dimension order
// <ab> is unchanged.
4191 using cuda_block_xy_nested_loop = cuda_block_xy_loop;
4192 using cuda_block_xz_nested_loop = cuda_block_xz_loop;
4193 using cuda_block_yx_nested_loop = cuda_block_yx_loop;
4194 using cuda_block_yz_nested_loop = cuda_block_yz_loop;
4195 using cuda_block_zx_nested_loop = cuda_block_zx_loop;
4196 using cuda_block_zy_nested_loop = cuda_block_zy_loop;
4197 
// Deprecated 3D names: cuda_block_<abc>_nested_loop is an alias of
// cuda_block_<abc>_loop; the dimension order <abc> is unchanged.
4198 using cuda_block_xyz_nested_loop = cuda_block_xyz_loop;
4199 using cuda_block_xzy_nested_loop = cuda_block_xzy_loop;
4200 using cuda_block_yxz_nested_loop = cuda_block_yxz_loop;
4201 using cuda_block_yzx_nested_loop = cuda_block_yzx_loop;
4202 using cuda_block_zxy_nested_loop = cuda_block_zxy_loop;
4203 using cuda_block_zyx_nested_loop = cuda_block_zyx_loop;
4204 
4205 } // namespace RAJA
4206 
4207 #endif // RAJA_CUDA_ACTIVE
4208 #endif
RAJA header file defining Simple Offset Calculators.
Header file for RAJA operator definitions.
Header file for basic RAJA policy mechanics.
Header file containing RAJA intrinsics templates for CUDA execution.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
Header file providing RAJA math templates.
multi_reduce_algorithm
Definition: policy.hpp:31
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
named_dim
Definition: types.hpp:53
Launch
Definition: PolicyBase.hpp:60
RAJA_HOST_DEVICE constexpr RAJA_INLINE T next_pow2(T n) noexcept
"round up" to the next greatest power of 2
Definition: math.hpp:63
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
kernel_sync_requirement
Definition: types.hpp:63
named_usage
Definition: types.hpp:44
@ ignored
Definition: types.hpp:45
@ unspecified
Definition: types.hpp:46
PolicyBaseT< Policy_, Pattern_, Launch_, Platform::undefined, Args... > make_policy_pattern_launch_t
Definition: PolicyBase.hpp:180
RAJA_HOST_DEVICE constexpr RAJA_INLINE T prev_pow2(T n) noexcept
"round down" to the largest power of 2 that is less than or equal to n
Definition: math.hpp:85
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file providing RAJA reduction declarations.
Header file containing RAJA sequential policy definitions.
static constexpr int_t multiply(int_t val) noexcept
Definition: types.hpp:255
Definition: PolicyBase.hpp:75
Header file for RAJA type definitions.