20 #ifndef RAJA_policy_cuda_HPP
21 #define RAJA_policy_cuda_HPP
23 #include "RAJA/config.hpp"
25 #if defined(RAJA_CUDA_ACTIVE)
44 using cuda_dim_t = RAJA_CUDA_DIM_T;
46 using cuda_dim_member_t = camp::decay<decltype(std::declval<cuda_dim_t>().x)>;
69 struct get_launch<false>
79 template<named_dim dim,
int BLOCK_SIZE,
int GRID_SIZE>
82 template<
typename... indexers>
85 template<
size_t divisor,
typename index>
88 template<
size_t divisor,
typename index>
/*!
 * \brief Concretizer whose maximum grid size is the per-SM block limit of the
 *        function multiplied by the number of SMs on the device.
 */
struct MaxOccupancyConcretizer
{
  /*!
   * \tparam IdxT arithmetic type the result is computed and returned in.
   * \tparam Data type exposing the members device_sm_per_device and
   *         func_max_blocks_per_sm.
   *
   * \param data source of the two occupancy values.
   *
   * \return func_max_blocks_per_sm * device_sm_per_device.
   */
  template<typename IdxT, typename Data>
  static IdxT get_max_grid_size(Data const& data)
  {
    // Both operands are converted to IdxT before they are multiplied.
    IdxT const sm_count      = data.device_sm_per_device;
    IdxT const blocks_per_sm = data.func_max_blocks_per_sm;

    return blocks_per_sm * sm_count;
  }
};
119 template<
typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
120 struct FractionOffsetOccupancyConcretizer
122 template<
typename IdxT,
typename Data>
123 static IdxT get_max_grid_size(Data
const& data)
125 using Fraction =
typename t_Fraction::template rebind<IdxT>;
127 IdxT device_sm_per_device = data.device_sm_per_device;
128 IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
135 if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) >
138 func_max_blocks_per_sm =
139 IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
142 IdxT func_max_blocks_per_device =
143 func_max_blocks_per_sm * device_sm_per_device;
145 return func_max_blocks_per_device;
157 template<
typename A
voidMaxOccupancyConcretizer>
158 struct AvoidDeviceMaxThreadOccupancyConcretizer
160 template<
typename IdxT,
typename Data>
161 static IdxT get_max_grid_size(Data
const& data)
163 IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
164 IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
165 IdxT func_threads_per_block = data.func_threads_per_block;
167 IdxT func_max_threads_per_sm =
168 func_threads_per_block * func_max_blocks_per_sm;
170 if (func_max_threads_per_sm < device_max_threads_per_sm)
172 return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
176 return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
/*!
 * \brief Concretizer that always reports the same preferred replication.
 *
 * \tparam preferred_replication the value reported.
 */
template<size_t preferred_replication>
struct ConstantPreferredReplicationConcretizer
{
  /*!
   * \tparam IdxT type the result is converted to.
   * \tparam Data type of the (ignored) data argument.
   *
   * \return preferred_replication converted to IdxT.
   */
  template<typename IdxT, typename Data>
  static IdxT get_preferred_replication(Data const& /* data: not consulted */)
  {
    return static_cast<IdxT>(preferred_replication);
  }
};
/*!
 * \brief Concretizer that picks one of two preferred replications by
 *        comparing the function's threads per block against a cutoff.
 *
 * \tparam t_cutoff threads-per-block threshold.
 * \tparam preferred_replication_before_cutoff value reported when
 *         func_threads_per_block < t_cutoff.
 * \tparam preferred_replication_after_cutoff value reported otherwise
 *         (including func_threads_per_block == t_cutoff).
 */
template<size_t t_cutoff,
         size_t preferred_replication_before_cutoff,
         size_t preferred_replication_after_cutoff>
struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
{
  /*!
   * \tparam IdxT type used for the comparison and the result.
   * \tparam Data type exposing the member func_threads_per_block.
   *
   * \return the before-cutoff or after-cutoff replication converted to IdxT.
   */
  template<typename IdxT, typename Data>
  static IdxT get_preferred_replication(Data const& data)
  {
    IdxT const threshold         = t_cutoff;
    IdxT const threads_per_block = data.func_threads_per_block;

    // The comparison is strict: a block exactly at the cutoff counts as
    // "after".
    return (threads_per_block < threshold)
               ? IdxT(preferred_replication_before_cutoff)
               : IdxT(preferred_replication_after_cutoff);
  }
};
227 template<
typename GetPreferredReplication>
228 struct SharedAtomicReplicationMaxPow2Concretizer
230 template<
typename IdxT,
typename Data>
231 static IdxT get_shared_replication(Data
const& data)
233 IdxT func_max_shared_replication_per_block =
234 data.func_max_shared_replication_per_block;
236 IdxT preferred_replication =
237 GetPreferredReplication {}.template get_preferred_replication<IdxT>(
241 std::min(preferred_replication, func_max_shared_replication_per_block));
250 template<
typename GetPreferredReplication>
251 struct GlobalAtomicReplicationMinPow2Concretizer
253 template<
typename IdxT,
typename Data>
254 static IdxT get_global_replication(Data
const& data)
256 IdxT func_min_global_replication = data.func_min_global_replication;
258 IdxT preferred_replication =
259 GetPreferredReplication {}.template get_preferred_replication<IdxT>(
263 std::max(preferred_replication, func_min_global_replication));
268 enum struct reduce_algorithm : int
271 init_device_combine_atomic_block,
272 init_host_combine_atomic_block
275 enum struct block_communication_mode : int
281 template<reduce_algorithm t_algorithm,
282 block_communication_mode t_comm_mode,
283 size_t t_replication,
284 size_t t_atomic_stride>
287 static constexpr reduce_algorithm algorithm = t_algorithm;
288 static constexpr block_communication_mode comm_mode = t_comm_mode;
289 static constexpr
size_t replication = t_replication;
290 static constexpr
size_t atomic_stride = t_atomic_stride;
291 static constexpr
bool consistent =
292 (algorithm == reduce_algorithm::combine_last_block);
298 init_host_combine_block_atomic_then_grid_atomic,
299 init_host_combine_global_atomic
/*!
 * \brief Bundle of three types describing an atomic replication tuning.
 *
 * The struct carries no data; it only re-exports its template arguments
 * under fixed member names.
 *
 * \tparam t_AtomicReplicationConcretizer exported as
 *         AtomicReplicationConcretizer.
 * \tparam t_ReplicationIndexer exported as ReplicationIndexer.
 * \tparam t_OffsetCalculator exported as OffsetCalculator.
 */
template<typename t_AtomicReplicationConcretizer,
         typename t_ReplicationIndexer,
         typename t_OffsetCalculator>
struct AtomicReplicationTuning
{
  using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
  using ReplicationIndexer           = t_ReplicationIndexer;
  using OffsetCalculator             = t_OffsetCalculator;
};
313 typename t_SharedAtomicReplicationTuning,
314 typename t_GlobalAtomicReplicationTuning>
315 struct MultiReduceTuning
318 using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
319 using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
320 static constexpr
bool consistent =
false;
330 template<
typename _IterationMapping,
332 typename... _IterationGetters>
336 template<
typename _IterationMapping,
338 typename... _IterationGetters>
339 struct cuda_flatten_indexer
342 RAJA::Pattern::region,
343 detail::get_launch<true >::value,
344 RAJA::Platform::cuda>
346 using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>;
349 template<
typename _IterationMapping,
350 typename _IterationGetter,
351 typename _LaunchConcretizer,
352 size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
356 RAJA::Pattern::forall,
357 detail::get_launch<Async>::value,
358 RAJA::Platform::cuda>
360 using IterationMapping = _IterationMapping;
361 using IterationGetter = _IterationGetter;
362 using LaunchConcretizer = _LaunchConcretizer;
367 size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
368 struct cuda_launch_explicit_t
371 RAJA::Pattern::region,
372 detail::get_launch<Async>::value,
373 RAJA::Platform::cuda>
383 template<
size_t BLOCK_SIZE,
384 size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
388 RAJA::Pattern::workgroup_exec,
389 detail::get_launch<Async>::value,
390 RAJA::Platform::cuda>
397 struct unordered_cuda_loop_y_block_iter_x_threadblock_average
400 RAJA::Pattern::workgroup_order,
401 RAJA::Platform::cuda>
412 template<
typename tuning>
415 RAJA::Pattern::reduce,
416 detail::get_launch<false>::value,
417 RAJA::Platform::cuda,
418 std::conditional_t<tuning::consistent,
423 template<
typename tuning>
424 struct cuda_multi_reduce_policy
427 RAJA::Pattern::multi_reduce,
428 detail::get_launch<false>::value,
429 RAJA::Platform::cuda,
430 std::conditional_t<tuning::consistent,
439 template<
typename host_policy>
440 struct cuda_atomic_explicit
447 using cuda_atomic = cuda_atomic_explicit<seq_atomic>;
451 struct cuda_block_reduce
456 struct cuda_warp_reduce
463 struct cuda_warp_direct
469 struct cuda_warp_loop
478 template<
typename Mask>
479 struct cuda_warp_masked_direct
488 template<
typename Mask>
489 struct cuda_warp_masked_loop
492 template<
typename Mask>
493 struct cuda_thread_masked_direct
496 template<
typename Mask>
497 struct cuda_thread_masked_loop
501 Pattern::synchronize,
512 int get_size(cuda_dim_t dims)
514 if (dims.x == 0 && dims.y == 0 && dims.z == 0)
518 return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
524 cuda_dim_t blocks {0, 0, 0};
525 cuda_dim_t threads {0, 0, 0};
527 CudaDims() =
default;
528 CudaDims(CudaDims
const&) =
default;
529 CudaDims& operator=(CudaDims
const&) =
default;
532 CudaDims(cuda_dim_member_t default_val)
533 : blocks {default_val, default_val, default_val},
534 threads {default_val, default_val, default_val}
538 int num_blocks()
const {
return get_size(blocks); }
541 int num_threads()
const {
return get_size(threads); }
544 cuda_dim_t get_blocks()
const
546 if (num_blocks() != 0)
548 return {(blocks.x ? blocks.x : 1), (blocks.y ? blocks.y : 1),
549 (blocks.z ? blocks.z : 1)};
558 cuda_dim_t get_threads()
const
560 if (num_threads() != 0)
562 return {(threads.x ? threads.x : 1), (threads.y ? threads.y : 1),
563 (threads.z ? threads.z : 1)};
572 template<named_dim dim>
573 struct CudaDimHelper;
579 template<
typename dim_t>
585 template<
typename dim_t>
596 template<
typename dim_t>
602 template<
typename dim_t>
613 template<
typename dim_t>
619 template<
typename dim_t>
626 template<named_dim dim,
typename dim_t>
632 template<named_dim dim,
typename dim_t>
635 return CudaDimHelper<dim>::set(d, value);
652 : block_size(_block_size),
653 grid_size(_grid_size)
658 template<
bool cache_threadIdx>
661 template<named_dim dim>
662 RAJA_DEVICE constexpr cuda_dim_member_t get_threadIdx()
const
669 struct ThreadIndices<true>
674 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
675 : m_threadIdx(threadIdx)
679 template<named_dim dim>
680 RAJA_DEVICE constexpr cuda_dim_member_t get_threadIdx()
const
687 template<
bool cache_blockIdx>
690 template<named_dim dim>
691 RAJA_DEVICE constexpr cuda_dim_member_t get_blockIdx()
const
698 struct BlockIndices<true>
703 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
704 : m_blockIdx(blockIdx)
708 template<named_dim dim>
709 RAJA_DEVICE constexpr cuda_dim_member_t get_blockIdx()
const
716 template<
bool cache_blockDim>
717 struct BlockDimensions
719 template<named_dim dim>
720 RAJA_DEVICE constexpr cuda_dim_member_t get_blockDim()
const
727 struct BlockDimensions<true>
732 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
733 : m_blockDim(blockDim)
737 template<named_dim dim>
738 RAJA_DEVICE constexpr cuda_dim_member_t get_blockDim()
const
745 template<
bool cache_gr
idDim>
746 struct GridDimensions
748 template<named_dim dim>
749 RAJA_DEVICE constexpr cuda_dim_member_t get_gridDim()
const
756 struct GridDimensions<true>
758 dim3 m_gridDim = gridDim;
761 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
766 template<named_dim dim>
767 RAJA_DEVICE constexpr cuda_dim_member_t get_gridDim()
const
774 template<
bool cache_threadIdx,
778 struct IndicesAndDims : ThreadIndices<cache_threadIdx>,
779 BlockIndices<cache_blockIdx>,
780 BlockDimensions<cache_blockDim>,
781 GridDimensions<cache_gridDim>
785 using NonCachedIndicesAndDims = IndicesAndDims<false, false, false, false>;
788 using CachedBlockDims = IndicesAndDims<false, false, true, false>;
791 using AllCachedIndicesAndDims = IndicesAndDims<true, true, true, true>;
800 template<
typename IndicesAndDimsT = NonCachedIndicesAndDims>
801 struct LaunchContextIndicesAndDimsPolicy
803 using indices_and_dims_t = IndicesAndDimsT;
806 using LaunchContextNonCachedIndicesAndDimsPolicy =
807 LaunchContextIndicesAndDimsPolicy<NonCachedIndicesAndDims>;
809 using LaunchContextCachedBlockDimsPolicy =
810 LaunchContextIndicesAndDimsPolicy<CachedBlockDims>;
812 using LaunchContextAllCachedIndicesAndDimsPolicy =
813 LaunchContextIndicesAndDimsPolicy<AllCachedIndicesAndDims>;
820 template<named_dim dim,
int BLOCK_SIZE,
int GRID_SIZE>
823 static_assert(BLOCK_SIZE > 0,
"block size must not be negative");
824 static_assert(GRID_SIZE > 0,
"grid size must not be negative");
826 static constexpr
int block_size = BLOCK_SIZE;
827 static constexpr
int grid_size = GRID_SIZE;
829 template<
typename IdxT = cuda_dim_member_t,
830 typename IdxNDims = NonCachedIndicesAndDims>
831 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
833 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>()) +
834 static_cast<IdxT
>(block_size) *
835 static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
838 template<
typename IdxT = cuda_dim_member_t,
839 typename IdxNDims = NonCachedIndicesAndDims>
840 RAJA_DEVICE static constexpr IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
842 return static_cast<IdxT
>(block_size) *
static_cast<IdxT
>(grid_size);
847 template<named_dim dim,
int GRID_SIZE>
848 struct IndexGlobal<dim, 1, GRID_SIZE>
850 static_assert(GRID_SIZE > 0,
"grid size must not be negative");
852 static constexpr
int block_size = 1;
853 static constexpr
int grid_size = GRID_SIZE;
855 template<
typename IdxT = cuda_dim_member_t,
856 typename IdxNDims = NonCachedIndicesAndDims>
857 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
859 return static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
862 template<
typename IdxT = cuda_dim_member_t,
863 typename IdxNDims = NonCachedIndicesAndDims>
864 RAJA_DEVICE static constexpr IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
866 return static_cast<IdxT
>(grid_size);
871 template<named_dim dim,
int BLOCK_SIZE>
872 struct IndexGlobal<dim, BLOCK_SIZE, 1>
874 static_assert(BLOCK_SIZE > 0,
"block size must not be negative");
876 static constexpr
int block_size = BLOCK_SIZE;
877 static constexpr
int grid_size = 1;
879 template<
typename IdxT = cuda_dim_member_t,
880 typename IdxNDims = NonCachedIndicesAndDims>
881 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
883 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>());
886 template<
typename IdxT = cuda_dim_member_t,
887 typename IdxNDims = NonCachedIndicesAndDims>
888 RAJA_DEVICE static constexpr IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
890 return static_cast<IdxT
>(block_size);
895 template<named_dim dim>
896 struct IndexGlobal<dim, 1, 1>
898 static constexpr
int block_size = 1;
899 static constexpr
int grid_size = 1;
901 template<
typename IdxT = cuda_dim_member_t,
902 typename IdxNDims = NonCachedIndicesAndDims>
903 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
905 return static_cast<IdxT
>(0);
908 template<
typename IdxT = cuda_dim_member_t,
909 typename IdxNDims = NonCachedIndicesAndDims>
910 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
912 return static_cast<IdxT
>(1);
917 template<named_dim dim,
int GRID_SIZE>
920 static_assert(GRID_SIZE > 0,
"grid size must not be negative");
923 static constexpr
int grid_size = GRID_SIZE;
925 template<
typename IdxT = cuda_dim_member_t,
926 typename IdxNDims = NonCachedIndicesAndDims>
927 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
929 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>()) +
930 static_cast<IdxT
>(idxNDims.template get_blockDim<dim>()) *
931 static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
934 template<
typename IdxT = cuda_dim_member_t,
935 typename IdxNDims = NonCachedIndicesAndDims>
936 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
938 return static_cast<IdxT
>(idxNDims.template get_blockDim<dim>()) *
939 static_cast<IdxT
>(grid_size);
944 template<named_dim dim>
948 static constexpr
int grid_size = 1;
950 template<
typename IdxT = cuda_dim_member_t,
951 typename IdxNDims = NonCachedIndicesAndDims>
952 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
954 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>());
957 template<
typename IdxT = cuda_dim_member_t,
958 typename IdxNDims = NonCachedIndicesAndDims>
959 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
961 return static_cast<IdxT
>(idxNDims.template get_blockDim<dim>());
966 template<named_dim dim,
int BLOCK_SIZE>
969 static_assert(BLOCK_SIZE > 0,
"block size must not be negative");
971 static constexpr
int block_size = BLOCK_SIZE;
974 template<
typename IdxT = cuda_dim_member_t,
975 typename IdxNDims = NonCachedIndicesAndDims>
976 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
978 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>()) +
979 static_cast<IdxT
>(block_size) *
980 static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
983 template<
typename IdxT = cuda_dim_member_t,
984 typename IdxNDims = NonCachedIndicesAndDims>
985 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
987 return static_cast<IdxT
>(block_size) *
988 static_cast<IdxT
>(idxNDims.template get_gridDim<dim>());
993 template<named_dim dim>
996 static constexpr
int block_size = 1;
999 template<
typename IdxT = cuda_dim_member_t,
1000 typename IdxNDims = NonCachedIndicesAndDims>
1001 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1003 return static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
1006 template<
typename IdxT = cuda_dim_member_t,
1007 typename IdxNDims = NonCachedIndicesAndDims>
1008 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1010 return static_cast<IdxT
>(idxNDims.template get_gridDim<dim>());
1015 template<named_dim dim>
1021 template<
typename IdxT = cuda_dim_member_t,
1022 typename IdxNDims = NonCachedIndicesAndDims>
1023 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1025 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>()) +
1026 static_cast<IdxT
>(idxNDims.template get_blockDim<dim>()) *
1027 static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
1030 template<
typename IdxT = cuda_dim_member_t,
1031 typename IdxNDims = NonCachedIndicesAndDims>
1032 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1034 return static_cast<IdxT
>(idxNDims.template get_blockDim<dim>()) *
1035 static_cast<IdxT
>(idxNDims.template get_gridDim<dim>());
1041 template<named_dim dim,
int GRID_SIZE>
1044 static_assert(GRID_SIZE > 0,
"grid size must not be negative");
1047 static constexpr
int grid_size = GRID_SIZE;
1049 template<
typename IdxT = cuda_dim_member_t,
1050 typename IdxNDims = NonCachedIndicesAndDims>
1051 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1053 return static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
1056 template<
typename IdxT = cuda_dim_member_t,
1057 typename IdxNDims = NonCachedIndicesAndDims>
1058 RAJA_DEVICE static constexpr IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1060 return static_cast<IdxT
>(grid_size);
1065 template<named_dim dim>
1069 static constexpr
int grid_size = 1;
1071 template<
typename IdxT = cuda_dim_member_t,
1072 typename IdxNDims = NonCachedIndicesAndDims>
1073 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1075 return static_cast<IdxT
>(0);
1078 template<
typename IdxT = cuda_dim_member_t,
1079 typename IdxNDims = NonCachedIndicesAndDims>
1080 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1082 return static_cast<IdxT
>(1);
1087 template<named_dim dim>
1093 template<
typename IdxT = cuda_dim_member_t,
1094 typename IdxNDims = NonCachedIndicesAndDims>
1095 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1097 return static_cast<IdxT
>(idxNDims.template get_blockIdx<dim>());
1100 template<
typename IdxT = cuda_dim_member_t,
1101 typename IdxNDims = NonCachedIndicesAndDims>
1102 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1104 return static_cast<IdxT
>(idxNDims.template get_gridDim<dim>());
1110 template<named_dim dim,
int BLOCK_SIZE>
1113 static_assert(BLOCK_SIZE > 0,
"block size must not be negative");
1115 static constexpr
int block_size = BLOCK_SIZE;
1118 template<
typename IdxT = cuda_dim_member_t,
1119 typename IdxNDims = NonCachedIndicesAndDims>
1120 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1122 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>());
1125 template<
typename IdxT = cuda_dim_member_t,
1126 typename IdxNDims = NonCachedIndicesAndDims>
1127 RAJA_DEVICE static constexpr IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1129 return static_cast<IdxT
>(block_size);
1134 template<named_dim dim>
1137 static constexpr
int block_size = 1;
1140 template<
typename IdxT = cuda_dim_member_t,
1141 typename IdxNDims = NonCachedIndicesAndDims>
1142 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1144 return static_cast<IdxT
>(0);
1147 template<
typename IdxT = cuda_dim_member_t,
1148 typename IdxNDims = NonCachedIndicesAndDims>
1149 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1151 return static_cast<IdxT
>(1);
1156 template<named_dim dim>
1162 template<
typename IdxT = cuda_dim_member_t,
1163 typename IdxNDims = NonCachedIndicesAndDims>
1164 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1166 return static_cast<IdxT
>(idxNDims.template get_threadIdx<dim>());
1169 template<
typename IdxT = cuda_dim_member_t,
1170 typename IdxNDims = NonCachedIndicesAndDims>
1171 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1173 return static_cast<IdxT
>(idxNDims.template get_blockDim<dim>());
1179 template<named_dim dim>
1185 template<
typename IdxT = cuda_dim_member_t,
1186 typename IdxNDims = NonCachedIndicesAndDims>
1187 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1189 return static_cast<IdxT
>(0);
1192 template<
typename IdxT = cuda_dim_member_t,
1193 typename IdxNDims = NonCachedIndicesAndDims>
1194 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1196 return static_cast<IdxT
>(1);
1201 template<
typename x_index>
1202 struct IndexFlatten<x_index>
1205 template<
typename IdxT = cuda_dim_member_t,
1206 typename IdxNDims = NonCachedIndicesAndDims>
1207 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1210 return x_index::template index<IdxT>(idxNDims);
1213 template<
typename IdxT = cuda_dim_member_t,
1214 typename IdxNDims = NonCachedIndicesAndDims>
1215 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1217 return x_index::template size<IdxT>(idxNDims);
1222 template<
typename x_index,
typename y_index>
1223 struct IndexFlatten<x_index, y_index>
1226 template<
typename IdxT = cuda_dim_member_t,
1227 typename IdxNDims = NonCachedIndicesAndDims>
1228 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1231 return x_index::template index<IdxT>(idxNDims) +
1232 x_index::template size<IdxT>(idxNDims) *
1233 (y_index::template index<IdxT>(idxNDims));
1236 template<
typename IdxT = cuda_dim_member_t,
1237 typename IdxNDims = NonCachedIndicesAndDims>
1238 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1240 return x_index::template size<IdxT>(idxNDims) *
1241 y_index::template size<IdxT>(idxNDims);
1246 template<
typename x_index,
typename y_index,
typename z_index>
1247 struct IndexFlatten<x_index, y_index, z_index>
1250 template<
typename IdxT = cuda_dim_member_t,
1251 typename IdxNDims = NonCachedIndicesAndDims>
1252 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1255 return x_index::template index<IdxT>(idxNDims) +
1256 x_index::template size<IdxT>(idxNDims) *
1257 (y_index::template index<IdxT>(idxNDims) +
1258 y_index::template size<IdxT>(idxNDims) *
1259 z_index::template index<IdxT>(idxNDims));
1262 template<
typename IdxT = cuda_dim_member_t,
1263 typename IdxNDims = NonCachedIndicesAndDims>
1264 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1266 return x_index::template size<IdxT>(idxNDims) *
1267 y_index::template size<IdxT>(idxNDims) *
1268 z_index::template size<IdxT>(idxNDims);
1272 template<
size_t divisor,
typename indexer>
1275 template<
typename IdxT = cuda_dim_member_t,
1276 typename IdxNDims = NonCachedIndicesAndDims>
1277 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1279 return indexer::template index<IdxT>(idxNDims) /
static_cast<IdxT
>(divisor);
1282 template<
typename IdxT = cuda_dim_member_t,
1283 typename IdxNDims = NonCachedIndicesAndDims>
1284 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1287 static_cast<IdxT
>(divisor));
1291 template<
size_t divisor,
typename indexer>
1294 template<
typename IdxT = cuda_dim_member_t,
1295 typename IdxNDims = NonCachedIndicesAndDims>
1296 RAJA_DEVICE static inline IdxT index(IdxNDims
const& idxNDims = IdxNDims {})
1298 return indexer::template index<IdxT>(idxNDims) %
static_cast<IdxT
>(divisor);
1301 template<
typename IdxT = cuda_dim_member_t,
1302 typename IdxNDims = NonCachedIndicesAndDims>
1303 RAJA_DEVICE static inline IdxT size(IdxNDims
const& idxNDims = IdxNDims {})
1305 return static_cast<IdxT
>(divisor);
1311 template<
typename index_global>
1312 struct get_index_thread;
1315 template<named_dim dim,
int BLOCK_SIZE,
int GRID_SIZE>
1316 struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
1318 using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
1322 template<
typename x_index,
typename y_index,
typename z_index>
1323 struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
1325 using type = IndexFlatten<typename get_index_thread<x_index>::type,
1326 typename get_index_thread<y_index>::type,
1327 typename get_index_thread<z_index>::type>;
1331 template<
typename index_global>
1332 struct get_index_block;
1335 template<named_dim dim,
int BLOCK_SIZE,
int GRID_SIZE>
1336 struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
1338 using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
1342 template<
typename x_index,
typename y_index,
typename z_index>
1343 struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
1345 using type = IndexFlatten<typename get_index_block<x_index>::type,
1346 typename get_index_block<y_index>::type,
1347 typename get_index_block<z_index>::type>;
1350 template<
size_t BLOCK_SIZE = named_usage::unspecified>
1351 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
1352 template<
size_t BLOCK_SIZE = named_usage::unspecified>
1353 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
1354 template<
size_t BLOCK_SIZE = named_usage::unspecified>
1355 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
1360 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
1361 thread_y<BLOCK_SIZE_Y>,
1362 thread_z<BLOCK_SIZE_Z>>;
1364 template<
size_t GRID_SIZE = named_usage::unspecified>
1365 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
1366 template<
size_t GRID_SIZE = named_usage::unspecified>
1367 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
1368 template<
size_t GRID_SIZE = named_usage::unspecified>
1369 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
1374 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
1375 block_y<GRID_SIZE_Y>,
1376 block_z<GRID_SIZE_Z>>;
1378 template<
size_t BLOCK_SIZE,
size_t GRID_SIZE = named_usage::unspecified>
1379 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
1380 template<
size_t BLOCK_SIZE,
size_t GRID_SIZE = named_usage::unspecified>
1381 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
1382 template<
size_t BLOCK_SIZE,
size_t GRID_SIZE = named_usage::unspecified>
1383 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
1386 template<
size_t BLOCK_SIZE_X,
1387 size_t BLOCK_SIZE_Y,
1388 size_t BLOCK_SIZE_Z,
1392 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
1393 global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
1394 global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
1397 template<
size_t WARP_SIZE = RAJA::policy::cuda::device_constants.WARP_SIZE,
1402 IndexDivide<WARP_SIZE,
1403 thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
1405 template<
size_t WARP_SIZE = RAJA::policy::cuda::device_constants.WARP_SIZE,
1412 using warp_global_xyz =
1413 IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
1414 block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
1418 using CudaAllCachedIndicesAndDims = cuda::AllCachedIndicesAndDims;
1419 using CudaCachedBlockDims = cuda::CachedBlockDims;
1420 using CudaNonCachedIndicesAndDims = cuda::NonCachedIndicesAndDims;
1422 template<
bool cache_threadIdx,
1423 bool cache_blockIdx,
1424 bool cache_blockDim,
1426 using CudaIndicesAndDims = cuda::IndicesAndDims<cache_threadIdx,
1431 using CudaLaunchContextAllCachedIndicesAndDimsPolicy =
1432 cuda::LaunchContextAllCachedIndicesAndDimsPolicy;
1433 using CudaLaunchContextCachedBlockDimsPolicy =
1434 cuda::LaunchContextCachedBlockDimsPolicy;
1435 template<
typename IndicesAndDimsT = cuda::NonCachedIndicesAndDims>
1436 using CudaLaunchContextIndicesAndDimsPolicy =
1437 cuda::LaunchContextIndicesAndDimsPolicy<IndicesAndDimsT>;
1438 using CudaLaunchContextNonCachedIndicesAndDimsPolicy =
1439 cuda::LaunchContextNonCachedIndicesAndDimsPolicy;
1443 using CudaAvoidDeviceMaxThreadOccupancyConcretizer =
1444 cuda::AvoidDeviceMaxThreadOccupancyConcretizer<
1445 cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
1447 template<
typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
1448 using CudaFractionOffsetOccupancyConcretizer =
1449 cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
1451 using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer;
1453 using CudaReduceDefaultConcretizer = CudaMaxOccupancyConcretizer;
1455 using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer;
1459 template<
size_t BLOCK_SIZE,
1461 size_t BLOCKS_PER_SM,
1463 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
1464 iteration_mapping::StridedLoop<named_usage::unspecified>,
1465 cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1466 CudaDefaultConcretizer,
1470 template<
size_t BLOCK_SIZE,
size_t GRID_SIZE,
size_t BLOCKS_PER_SM>
1471 using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit<
1472 iteration_mapping::StridedLoop<named_usage::unspecified>,
1473 cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1474 CudaDefaultConcretizer,
1478 template<
size_t BLOCK_SIZE,
size_t GRID_SIZE,
bool Async = false>
1479 using cuda_exec_grid = policy::cuda::cuda_exec_explicit<
1480 iteration_mapping::StridedLoop<named_usage::unspecified>,
1481 cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1482 CudaDefaultConcretizer,
1483 policy::cuda::MIN_BLOCKS_PER_SM,
1486 template<
size_t BLOCK_SIZE,
size_t GRID_SIZE>
1487 using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit<
1488 iteration_mapping::StridedLoop<named_usage::unspecified>,
1489 cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
1490 CudaDefaultConcretizer,
1491 policy::cuda::MIN_BLOCKS_PER_SM,
1494 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM,
bool Async = false>
1495 using cuda_exec_explicit =
1496 policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1497 cuda::global_x<BLOCK_SIZE>,
1498 CudaDefaultConcretizer,
1502 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM>
1503 using cuda_exec_explicit_async =
1504 policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1505 cuda::global_x<BLOCK_SIZE>,
1506 CudaDefaultConcretizer,
1510 template<
size_t BLOCK_SIZE,
bool Async = false>
1512 policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1513 cuda::global_x<BLOCK_SIZE>,
1514 CudaDefaultConcretizer,
1515 policy::cuda::MIN_BLOCKS_PER_SM,
1518 template<
size_t BLOCK_SIZE>
1519 using cuda_exec_async =
1520 policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
1521 cuda::global_x<BLOCK_SIZE>,
1522 CudaDefaultConcretizer,
1523 policy::cuda::MIN_BLOCKS_PER_SM,
1526 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM,
bool Async = false>
1527 using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit<
1528 iteration_mapping::StridedLoop<named_usage::unspecified>,
1529 cuda::global_x<BLOCK_SIZE>,
1530 CudaDefaultConcretizer,
1534 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM>
1535 using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit<
1536 iteration_mapping::StridedLoop<named_usage::unspecified>,
1537 cuda::global_x<BLOCK_SIZE>,
1538 CudaDefaultConcretizer,
// cuda_exec_occ_calc<BLOCK_SIZE, Async = false>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>, CudaDefaultConcretizer and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1547 -> 1550); presumably Async -- confirm.
1542 template<
size_t BLOCK_SIZE,
bool Async = false>
1543 using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit<
1544 iteration_mapping::StridedLoop<named_usage::unspecified>,
1545 cuda::global_x<BLOCK_SIZE>,
1546 CudaDefaultConcretizer,
1547 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_occ_calc_async<BLOCK_SIZE>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>, CudaDefaultConcretizer and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1555 -> 1558); presumably a fixed async flag -- confirm.
1550 template<
size_t BLOCK_SIZE>
1551 using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit<
1552 iteration_mapping::StridedLoop<named_usage::unspecified>,
1553 cuda::global_x<BLOCK_SIZE>,
1554 CudaDefaultConcretizer,
1555 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_occ_max_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async = false>:
// alias of policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE> and CudaMaxOccupancyConcretizer (instead of
// the default concretizer used by the cuda_exec_occ_calc aliases).
// NOTE(review): trailing arguments and `>;` are missing from this listing
// (1562 -> 1566); presumably BLOCKS_PER_SM and Async -- confirm.
1558 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM,
bool Async = false>
1559 using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit<
1560 iteration_mapping::StridedLoop<named_usage::unspecified>,
1561 cuda::global_x<BLOCK_SIZE>,
1562 CudaMaxOccupancyConcretizer,
// cuda_exec_occ_max_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE> and CudaMaxOccupancyConcretizer.
// NOTE(review): trailing arguments and `>;` are missing from this listing
// (1570 -> 1574); presumably BLOCKS_PER_SM and a fixed async flag -- confirm.
1566 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM>
1567 using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit<
1568 iteration_mapping::StridedLoop<named_usage::unspecified>,
1569 cuda::global_x<BLOCK_SIZE>,
1570 CudaMaxOccupancyConcretizer,
// cuda_exec_occ_max<BLOCK_SIZE, Async = false>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>, CudaMaxOccupancyConcretizer and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1579 -> 1582); presumably Async -- confirm.
1574 template<
size_t BLOCK_SIZE,
bool Async = false>
1575 using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit<
1576 iteration_mapping::StridedLoop<named_usage::unspecified>,
1577 cuda::global_x<BLOCK_SIZE>,
1578 CudaMaxOccupancyConcretizer,
1579 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_occ_max_async<BLOCK_SIZE>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>, CudaMaxOccupancyConcretizer and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1587 -> 1590); presumably a fixed async flag -- confirm.
1582 template<
size_t BLOCK_SIZE>
1583 using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit<
1584 iteration_mapping::StridedLoop<named_usage::unspecified>,
1585 cuda::global_x<BLOCK_SIZE>,
1586 CudaMaxOccupancyConcretizer,
1587 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_occ_fraction_explicit: alias of policy::cuda::cuda_exec_explicit
// with a StridedLoop mapping, cuda::global_x<BLOCK_SIZE> and
// CudaFractionOffsetOccupancyConcretizer<Fraction, 0> (offset fixed to 0).
// NOTE(review): the template header is cut in this listing (1591 -> 1594):
// `Fraction` is used below but its declaration, and the end of the parameter
// list, are not visible. The async sibling declares `typename Fraction`.
// NOTE(review): trailing arguments and `>;` are also missing (1597 -> 1601).
1590 template<
size_t BLOCK_SIZE,
1591 size_t BLOCKS_PER_SM,
1594 using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit<
1595 iteration_mapping::StridedLoop<named_usage::unspecified>,
1596 cuda::global_x<BLOCK_SIZE>,
1597 CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
// cuda_exec_occ_fraction_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM, Fraction>:
// alias of policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE> and
// CudaFractionOffsetOccupancyConcretizer<Fraction, 0>.
// NOTE(review): trailing arguments and `>;` are missing from this listing
// (1605 -> 1609); presumably BLOCKS_PER_SM and a fixed async flag -- confirm.
1601 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM,
typename Fraction>
1602 using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit<
1603 iteration_mapping::StridedLoop<named_usage::unspecified>,
1604 cuda::global_x<BLOCK_SIZE>,
1605 CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
// cuda_exec_occ_fraction<BLOCK_SIZE, Fraction, Async = false>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>,
// CudaFractionOffsetOccupancyConcretizer<Fraction, 0> and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1614 -> 1617); presumably Async -- confirm.
1609 template<
size_t BLOCK_SIZE,
typename Fraction,
bool Async = false>
1610 using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit<
1611 iteration_mapping::StridedLoop<named_usage::unspecified>,
1612 cuda::global_x<BLOCK_SIZE>,
1613 CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
1614 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_occ_fraction_async<BLOCK_SIZE, Fraction>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>,
// CudaFractionOffsetOccupancyConcretizer<Fraction, 0> and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1622 -> 1625); presumably a fixed async flag -- confirm.
1617 template<
size_t BLOCK_SIZE,
typename Fraction>
1618 using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit<
1619 iteration_mapping::StridedLoop<named_usage::unspecified>,
1620 cuda::global_x<BLOCK_SIZE>,
1621 CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
1622 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_occ_custom_explicit: alias of policy::cuda::cuda_exec_explicit
// with a StridedLoop mapping and cuda::global_x<BLOCK_SIZE>; the template
// takes a caller-supplied `Concretizer` type.
// NOTE(review): the template header is cut after `typename Concretizer,`
// (1627 -> 1629) and the alias is cut after the indexer argument
// (1631 -> 1636); presumably Concretizer, BLOCKS_PER_SM and an async flag
// follow -- confirm against the full header.
1625 template<
size_t BLOCK_SIZE,
1626 size_t BLOCKS_PER_SM,
1627 typename Concretizer,
1629 using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit<
1630 iteration_mapping::StridedLoop<named_usage::unspecified>,
1631 cuda::global_x<BLOCK_SIZE>,
// cuda_exec_occ_custom_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM, Concretizer>:
// alias of policy::cuda::cuda_exec_explicit with a StridedLoop mapping and
// cuda::global_x<BLOCK_SIZE>.
// NOTE(review): everything after the indexer argument is missing from this
// listing (1639 -> 1644); presumably Concretizer, BLOCKS_PER_SM and a fixed
// async flag -- confirm.
1636 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM,
typename Concretizer>
1637 using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit<
1638 iteration_mapping::StridedLoop<named_usage::unspecified>,
1639 cuda::global_x<BLOCK_SIZE>,
// cuda_exec_occ_custom<BLOCK_SIZE, Concretizer, Async = false>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE> and policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): embedded line 1648 (between the indexer and
// MIN_BLOCKS_PER_SM) is absent -- presumably the `Concretizer,` argument.
// The final argument and `>;` are also missing (1649 -> 1652). Confirm.
1644 template<
size_t BLOCK_SIZE,
typename Concretizer,
bool Async = false>
1645 using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit<
1646 iteration_mapping::StridedLoop<named_usage::unspecified>,
1647 cuda::global_x<BLOCK_SIZE>,
1649 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_occ_custom_async<BLOCK_SIZE, Concretizer>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE> and policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): embedded line 1656 (between the indexer and
// MIN_BLOCKS_PER_SM) is absent -- presumably the `Concretizer,` argument.
// The final argument and `>;` are also missing (1657 -> 1660). Confirm.
1652 template<
size_t BLOCK_SIZE,
typename Concretizer>
1653 using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit<
1654 iteration_mapping::StridedLoop<named_usage::unspecified>,
1655 cuda::global_x<BLOCK_SIZE>,
1657 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_with_reduce_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async = false>:
// alias of policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE> and CudaReduceDefaultConcretizer.
// NOTE(review): trailing arguments and `>;` are missing from this listing
// (1664 -> 1668); presumably BLOCKS_PER_SM and Async -- confirm.
1660 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM,
bool Async = false>
1661 using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit<
1662 iteration_mapping::StridedLoop<named_usage::unspecified>,
1663 cuda::global_x<BLOCK_SIZE>,
1664 CudaReduceDefaultConcretizer,
// cuda_exec_with_reduce_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE> and CudaReduceDefaultConcretizer.
// NOTE(review): trailing arguments and `>;` are missing from this listing
// (1672 -> 1676); presumably BLOCKS_PER_SM and a fixed async flag -- confirm.
1668 template<
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM>
1669 using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit<
1670 iteration_mapping::StridedLoop<named_usage::unspecified>,
1671 cuda::global_x<BLOCK_SIZE>,
1672 CudaReduceDefaultConcretizer,
// cuda_exec_with_reduce<BLOCK_SIZE, Async = false>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>, CudaReduceDefaultConcretizer and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1681 -> 1684); presumably Async -- confirm.
1676 template<
size_t BLOCK_SIZE,
bool Async = false>
1677 using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit<
1678 iteration_mapping::StridedLoop<named_usage::unspecified>,
1679 cuda::global_x<BLOCK_SIZE>,
1680 CudaReduceDefaultConcretizer,
1681 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_with_reduce_async<BLOCK_SIZE>: alias of
// policy::cuda::cuda_exec_explicit with a StridedLoop mapping,
// cuda::global_x<BLOCK_SIZE>, CudaReduceDefaultConcretizer and
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): the final argument and `>;` are missing from this listing
// (1689 -> 1692); presumably a fixed async flag -- confirm.
1684 template<
size_t BLOCK_SIZE>
1685 using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit<
1686 iteration_mapping::StridedLoop<named_usage::unspecified>,
1687 cuda::global_x<BLOCK_SIZE>,
1688 CudaReduceDefaultConcretizer,
1689 policy::cuda::MIN_BLOCKS_PER_SM,
// cuda_exec_base_explicit: std::conditional_t choosing between
// cuda_exec_with_reduce_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> (first
// alternative) and cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>
// (second alternative).
// NOTE(review): this listing drops embedded lines 1693, 1695 and 1697, so
// the BLOCK_SIZE and Async parameter declarations and the conditional_t
// condition are not visible. By analogy with cuda_exec_base in this file the
// condition is presumably `with_reduce` -- confirm.
1692 template<
bool with_reduce,
1694 size_t BLOCKS_PER_SM,
1696 using cuda_exec_base_explicit = std::conditional_t<
1698 cuda_exec_with_reduce_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
1699 cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>>;
// cuda_exec_base_explicit_async<with_reduce, BLOCK_SIZE, BLOCKS_PER_SM>:
// std::conditional_t choosing between
// cuda_exec_with_reduce_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM> (first
// alternative) and cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>
// (second alternative).
// NOTE(review): embedded line 1703, which would hold the conditional_t
// condition, is absent from this listing; presumably `with_reduce,` --
// confirm.
1701 template<
bool with_reduce,
size_t BLOCK_SIZE,
size_t BLOCKS_PER_SM>
1702 using cuda_exec_base_explicit_async = std::conditional_t<
1704 cuda_exec_with_reduce_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
1705 cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>>;
// cuda_exec_base<with_reduce, BLOCK_SIZE, Async = false>: compile-time
// selection between two execution policies. Resolves to
// cuda_exec_with_reduce<BLOCK_SIZE, Async> when with_reduce is true and to
// cuda_exec<BLOCK_SIZE, Async> otherwise.
1707 template<
bool with_reduce,
size_t BLOCK_SIZE,
bool Async = false>
1708 using cuda_exec_base =
1709 std::conditional_t<with_reduce,
1710 cuda_exec_with_reduce<BLOCK_SIZE, Async>,
1711 cuda_exec<BLOCK_SIZE, Async>>;
// cuda_exec_base_async<with_reduce, BLOCK_SIZE>: compile-time selection
// between the async execution policies. Resolves to
// cuda_exec_with_reduce_async<BLOCK_SIZE> when with_reduce is true and to
// cuda_exec_async<BLOCK_SIZE> otherwise.
1713 template<
bool with_reduce,
size_t BLOCK_SIZE>
1714 using cuda_exec_base_async =
1715 std::conditional_t<with_reduce,
1716 cuda_exec_with_reduce_async<BLOCK_SIZE>,
1717 cuda_exec_async<BLOCK_SIZE>>;
// cuda_work_explicit: re-exports policy::cuda::cuda_work_explicit, forwarding
// BLOCK_SIZE, BLOCKS_PER_SM and Async unchanged. BLOCKS_PER_SM defaults to
// policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): embedded line 1723 is absent from this listing, so the
// declaration of `Async` (and the end of the parameter list) is not visible.
1721 template<
size_t BLOCK_SIZE,
1722 size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
1724 using cuda_work_explicit =
1725 policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
// cuda_work_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>: alias of
// policy::cuda::cuda_work_explicit with its third argument fixed to true.
// BLOCKS_PER_SM defaults to policy::cuda::MIN_BLOCKS_PER_SM.
1727 template<
size_t BLOCK_SIZE,
1728 size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
1729 using cuda_work_explicit_async =
1730 policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
// cuda_work<BLOCK_SIZE, Async = false>: alias of
// policy::cuda::cuda_work_explicit with the blocks-per-SM argument fixed to
// policy::cuda::MIN_BLOCKS_PER_SM; Async is forwarded.
1732 template<
size_t BLOCK_SIZE,
bool Async = false>
1733 using cuda_work = policy::cuda::
1734 cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
// cuda_work_async<BLOCK_SIZE>: alias of policy::cuda::cuda_work_explicit
// with the blocks-per-SM argument fixed to policy::cuda::MIN_BLOCKS_PER_SM
// and the third argument fixed to true.
1736 template<
size_t BLOCK_SIZE>
1737 using cuda_work_async = policy::cuda::
1738 cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
// Bring selected names from policy::cuda into the enclosing scope so they
// can be used without the policy::cuda:: qualifier.
1740 using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
1743 using policy::cuda::cuda_atomic;
1744 using policy::cuda::cuda_atomic_explicit;
// cuda_reduce_tuning: wraps cuda::ReduceTuning<algorithm, comm_mode,
// replication, atomic_stride> in policy::cuda::cuda_reduce_policy.
// NOTE(review): embedded lines 1750-1751 are absent from this listing, so
// the declarations of `replication` and `atomic_stride` (and the end of the
// template parameter list) are not visible here.
1748 template<cuda::reduce_algorithm algorithm,
1749 cuda::block_communication_mode comm_mode,
1752 using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
1753 cuda::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
// Named instantiations of cuda_reduce_tuning. Each alias fixes the reduction
// algorithm (first argument) and the block communication mode (second
// argument); the alias name spells out both choices.
// NOTE(review): in this listing every alias below is cut after its second
// argument (two embedded lines are skipped each time, e.g. 1776 -> 1780).
// The remaining arguments and the closing `>;` must be taken from the full
// header; presumably they are the replication and atomic_stride values.

// combine_last_block algorithm, device_fence communication.
1774 using cuda_reduce_device_fence =
1775 cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
1776 cuda::block_communication_mode::device_fence,
// combine_last_block algorithm, block_fence communication.
1780 using cuda_reduce_block_fence =
1781 cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
1782 cuda::block_communication_mode::block_fence,
// init_device_combine_atomic_block algorithm, device_fence communication.
1786 using cuda_reduce_atomic_device_init_device_fence =
1787 cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
1788 cuda::block_communication_mode::device_fence,
// init_device_combine_atomic_block algorithm, block_fence communication.
1792 using cuda_reduce_atomic_device_init_block_fence =
1793 cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
1794 cuda::block_communication_mode::block_fence,
// init_host_combine_atomic_block algorithm, device_fence communication.
1798 using cuda_reduce_atomic_host_init_device_fence =
1799 cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
1800 cuda::block_communication_mode::device_fence,
// init_host_combine_atomic_block algorithm, block_fence communication.
1804 using cuda_reduce_atomic_host_init_block_fence =
1805 cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
1806 cuda::block_communication_mode::block_fence,
// Default reduction policy: the device-fence, combine-last-block tuning.
1812 using cuda_reduce = cuda_reduce_device_fence;
// Default atomic reduction policy: the host-init, device-fence atomic tuning.
1816 using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence;
// cuda_reduce_base<with_atomic>: compile-time selection; resolves to
// cuda_reduce_atomic when with_atomic is true and to cuda_reduce otherwise.
1820 template<
bool with_atomic>
1821 using cuda_reduce_base =
1822 std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
// cuda_multi_reduce_tuning: wraps cuda::MultiReduceTuning in
// policy::cuda::cuda_multi_reduce_policy. Two cuda::AtomicReplicationTuning
// arguments are visible:
//   - shared:  <SharedAtomicReplicationConcretizer,
//               SharedAtomicReplicationIndexer, GetOffsetRight<int>>
//   - global:  <GlobalAtomicReplicationConcretizer,
//               GlobalAtomicReplicationIndexer, GetOffsetLeft<int>>
// NOTE(review): the start of the template header (before embedded line 1827)
// and embedded line 1833 (the first MultiReduceTuning argument) are absent
// from this listing. The named tunings in this file pass a
// cuda::multi_reduce_algorithm value first, so both gaps presumably concern
// an `algorithm` parameter -- confirm against the full header.
1827 typename SharedAtomicReplicationConcretizer,
1828 typename SharedAtomicReplicationIndexer,
1829 typename GlobalAtomicReplicationConcretizer,
1830 typename GlobalAtomicReplicationIndexer>
1831 using cuda_multi_reduce_tuning =
1832 policy::cuda::cuda_multi_reduce_policy<cuda::MultiReduceTuning<
1834 cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
1835 SharedAtomicReplicationIndexer,
1836 GetOffsetRight<int>>,
1837 cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
1838 GlobalAtomicReplicationIndexer,
1839 GetOffsetLeft<int>>>>;
// Named instantiations of cuda_multi_reduce_tuning.
// NOTE(review): several template arguments are absent from this listing
// (embedded line numbers skip inside each alias); the gaps are marked below.
// Restore them from the full header before compiling.

// Block-atomic-then-grid-atomic algorithm, host-initialized.
// Shared replication: MaxPow2 concretizer with constant preference 16.
// Global replication: MinPow2 concretizer with constant preference 2, indexed
// by cuda::warp_global_xyz<>.
// NOTE(review): embedded line 1859 (presumably the shared indexer) is absent.
1853 using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init =
1854 cuda_multi_reduce_tuning<
1855 cuda::multi_reduce_algorithm::
1856 init_host_combine_block_atomic_then_grid_atomic,
1857 cuda::SharedAtomicReplicationMaxPow2Concretizer<
1858 cuda::ConstantPreferredReplicationConcretizer<16>>,
1860 cuda::GlobalAtomicReplicationMinPow2Concretizer<
1861 cuda::ConstantPreferredReplicationConcretizer<2>>,
1862 cuda::warp_global_xyz<>>;
// Same as the alias above except the shared replication preference is the
// constant 0. The name marks it as a fallback-testing configuration.
// NOTE(review): embedded line 1870 (presumably the shared indexer) is absent.
1864 using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
1865 cuda_multi_reduce_tuning<
1866 cuda::multi_reduce_algorithm::
1867 init_host_combine_block_atomic_then_grid_atomic,
1868 cuda::SharedAtomicReplicationMaxPow2Concretizer<
1869 cuda::ConstantPreferredReplicationConcretizer<0>>,
1871 cuda::GlobalAtomicReplicationMinPow2Concretizer<
1872 cuda::ConstantPreferredReplicationConcretizer<2>>,
1873 cuda::warp_global_xyz<>>;
// Global-atomic algorithm, host-initialized. Global replication: MinPow2
// concretizer with constant preference 2, indexed by cuda::warp_global_xyz<>.
// NOTE(review): embedded lines 1877-1878 (presumably the shared replication
// concretizer and indexer) are absent.
1875 using cuda_multi_reduce_atomic_global_host_init = cuda_multi_reduce_tuning<
1876 cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
1879 cuda::GlobalAtomicReplicationMinPow2Concretizer<
1880 cuda::ConstantPreferredReplicationConcretizer<2>>,
1881 cuda::warp_global_xyz<>>;
// Global-atomic algorithm, host-initialized, with global replication
// preference fixed to the constant 1.
// NOTE(review): embedded lines 1886-1887 are absent, and the alias is cut
// after the global concretizer (1889 -> 1894): the global indexer and the
// closing `>;` are missing.
1883 using cuda_multi_reduce_atomic_global_no_replication_host_init =
1884 cuda_multi_reduce_tuning<
1885 cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
1888 cuda::GlobalAtomicReplicationMinPow2Concretizer<
1889 cuda::ConstantPreferredReplicationConcretizer<1>>,
// Default atomic multi-reduce policy: the block-atomic-then-grid-atomic,
// host-initialized tuning.
1894 using cuda_multi_reduce_atomic =
1895 cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
// Alternative named for low overhead at lower performance: the global-atomic
// tuning without replication.
1898 using cuda_multi_reduce_atomic_low_performance_low_overhead =
1899 cuda_multi_reduce_atomic_global_no_replication_host_init;
// Bring the block- and warp-level reduce policy names from policy::cuda into
// the enclosing scope.
1903 using policy::cuda::cuda_block_reduce;
1904 using policy::cuda::cuda_warp_reduce;
// Warp-level indexer policies. All three instantiate
// RAJA::policy::cuda::cuda_indexer over
// cuda::thread_x<RAJA::policy::cuda::device_constants.WARP_SIZE> and differ
// only in the iteration mapping: DirectUnchecked, Direct, or StridedLoop.
// NOTE(review): in each alias one embedded line between the mapping and the
// indexer (1908, 1912, 1916) is absent from this listing, so one template
// argument is missing. Restore it from the full header.
1906 using cuda_warp_direct_unchecked = RAJA::policy::cuda::cuda_indexer<
1907 iteration_mapping::DirectUnchecked,
1909 cuda::thread_x<RAJA::policy::cuda::device_constants.WARP_SIZE>>;
1910 using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer<
1911 iteration_mapping::Direct,
1913 cuda::thread_x<RAJA::policy::cuda::device_constants.WARP_SIZE>>;
1914 using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer<
1915 iteration_mapping::StridedLoop<named_usage::unspecified>,
1917 cuda::thread_x<RAJA::policy::cuda::device_constants.WARP_SIZE>>;
// Bring the masked warp/thread policies and the synchronize policy from
// policy::cuda into the enclosing scope.
1919 using policy::cuda::cuda_warp_masked_direct;
1920 using policy::cuda::cuda_warp_masked_loop;
1922 using policy::cuda::cuda_thread_masked_direct;
1923 using policy::cuda::cuda_thread_masked_loop;
1926 using policy::cuda::cuda_synchronize;
// cuda_launch_explicit_t: re-exports policy::cuda::cuda_launch_explicit_t,
// forwarding Async, num_threads and BLOCKS_PER_SM unchanged. BLOCKS_PER_SM
// defaults to policy::cuda::MIN_BLOCKS_PER_SM.
// NOTE(review): embedded line 1930 is absent from this listing, so the
// declaration of `num_threads` is not visible here. cuda_launch_t in this
// file declares it as `int num_threads = named_usage::unspecified`;
// presumably the same here -- confirm.
1929 template<
bool Async,
1931 size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
1932 using cuda_launch_explicit_t =
1933 policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
// cuda_launch_t<Async, num_threads = named_usage::unspecified>: alias of
// policy::cuda::cuda_launch_explicit_t with Async forwarded.
// NOTE(review): embedded lines 1939-1942 are absent from this listing. Only
// the tail `: policy::cuda::MIN_BLOCKS_PER_SM>` survives, which looks like
// the false branch of a conditional expression selecting the blocks-per-SM
// argument. The condition and true branch cannot be determined from here.
1936 template<
bool Async,
int num_threads = named_usage::unspecified>
1937 using cuda_launch_t =
1938 policy::cuda::cuda_launch_explicit_t<Async,
1943 : policy::cuda::MIN_BLOCKS_PER_SM>;
// Variadic shorthands for policy::cuda::cuda_indexer, one per iteration
// mapping. Each takes a pack of indexer types.
// NOTE(review): every alias below is cut after its iteration-mapping
// argument in this listing (e.g. 1949 -> 1953). The remaining arguments
// (presumably a sync requirement and `indexers...`) and the closing `>;`
// must be restored from the full header. In particular, what distinguishes
// cuda_indexer_syncable_loop from cuda_indexer_loop is in the missing part.

// DirectUnchecked mapping.
1947 template<
typename... indexers>
1948 using cuda_indexer_direct_unchecked =
1949 policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
// Direct mapping.
1953 template<
typename... indexers>
1954 using cuda_indexer_direct =
1955 policy::cuda::cuda_indexer<iteration_mapping::Direct,
// StridedLoop mapping.
1959 template<
typename... indexers>
1960 using cuda_indexer_loop = policy::cuda::cuda_indexer<
1961 iteration_mapping::StridedLoop<named_usage::unspecified>,
// StridedLoop mapping, "syncable" variant.
1965 template<
typename... indexers>
1966 using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer<
1967 iteration_mapping::StridedLoop<named_usage::unspecified>,
// Variadic shorthands for policy::cuda::cuda_flatten_indexer, one per
// iteration mapping (DirectUnchecked, Direct, StridedLoop).
// NOTE(review): every alias below is cut after its iteration-mapping
// argument in this listing (e.g. 1973 -> 1977); the remaining arguments and
// the closing `>;` must be restored from the full header.
1971 template<
typename... indexers>
1972 using cuda_flatten_indexer_direct_unchecked =
1973 policy::cuda::cuda_flatten_indexer<iteration_mapping::DirectUnchecked,
1977 template<
typename... indexers>
1978 using cuda_flatten_indexer_direct =
1979 policy::cuda::cuda_flatten_indexer<iteration_mapping::Direct,
1983 template<
typename... indexers>
1984 using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
1985 iteration_mapping::StridedLoop<named_usage::unspecified>,
// cuda_thread_direct_unchecked<dims...>: cuda_indexer_direct_unchecked over
// one cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
// per listed dimension.
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 1997) is absent from this listing.
//
// The named aliases below cover every ordering of one, two or three of the
// x/y/z dimensions; the letters in the alias name give the order of the
// named_dim arguments (e.g. _yx_ -> named_dim::y, named_dim::x).
1998 using cuda_thread_direct_unchecked = cuda_indexer_direct_unchecked<
1999 cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2000 using cuda_thread_x_direct_unchecked =
2001 cuda_thread_direct_unchecked<named_dim::x>;
2002 using cuda_thread_y_direct_unchecked =
2003 cuda_thread_direct_unchecked<named_dim::y>;
2004 using cuda_thread_z_direct_unchecked =
2005 cuda_thread_direct_unchecked<named_dim::z>;
2006 using cuda_thread_xy_direct_unchecked =
2007 cuda_thread_direct_unchecked<named_dim::x, named_dim::y>;
2008 using cuda_thread_xz_direct_unchecked =
2009 cuda_thread_direct_unchecked<named_dim::x, named_dim::z>;
2010 using cuda_thread_yx_direct_unchecked =
2011 cuda_thread_direct_unchecked<named_dim::y, named_dim::x>;
2012 using cuda_thread_yz_direct_unchecked =
2013 cuda_thread_direct_unchecked<named_dim::y, named_dim::z>;
2014 using cuda_thread_zx_direct_unchecked =
2015 cuda_thread_direct_unchecked<named_dim::z, named_dim::x>;
2016 using cuda_thread_zy_direct_unchecked =
2017 cuda_thread_direct_unchecked<named_dim::z, named_dim::y>;
2018 using cuda_thread_xyz_direct_unchecked =
2019 cuda_thread_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
2020 using cuda_thread_xzy_direct_unchecked =
2021 cuda_thread_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
2022 using cuda_thread_yxz_direct_unchecked =
2023 cuda_thread_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
2024 using cuda_thread_yzx_direct_unchecked =
2025 cuda_thread_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
2026 using cuda_thread_zxy_direct_unchecked =
2027 cuda_thread_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
2028 using cuda_thread_zyx_direct_unchecked =
2029 cuda_thread_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
// cuda_block_direct_unchecked<dims...>: cuda_indexer_direct_unchecked over
// one cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
// per listed dimension (the two usage arguments are swapped relative to the
// cuda_thread_* aliases).
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 2031) is absent from this listing.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2032 using cuda_block_direct_unchecked = cuda_indexer_direct_unchecked<
2033 cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2034 using cuda_block_x_direct_unchecked = cuda_block_direct_unchecked<named_dim::x>;
2035 using cuda_block_y_direct_unchecked = cuda_block_direct_unchecked<named_dim::y>;
2036 using cuda_block_z_direct_unchecked = cuda_block_direct_unchecked<named_dim::z>;
2037 using cuda_block_xy_direct_unchecked =
2038 cuda_block_direct_unchecked<named_dim::x, named_dim::y>;
2039 using cuda_block_xz_direct_unchecked =
2040 cuda_block_direct_unchecked<named_dim::x, named_dim::z>;
2041 using cuda_block_yx_direct_unchecked =
2042 cuda_block_direct_unchecked<named_dim::y, named_dim::x>;
2043 using cuda_block_yz_direct_unchecked =
2044 cuda_block_direct_unchecked<named_dim::y, named_dim::z>;
2045 using cuda_block_zx_direct_unchecked =
2046 cuda_block_direct_unchecked<named_dim::z, named_dim::x>;
2047 using cuda_block_zy_direct_unchecked =
2048 cuda_block_direct_unchecked<named_dim::z, named_dim::y>;
2049 using cuda_block_xyz_direct_unchecked =
2050 cuda_block_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
2051 using cuda_block_xzy_direct_unchecked =
2052 cuda_block_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
2053 using cuda_block_yxz_direct_unchecked =
2054 cuda_block_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
2055 using cuda_block_yzx_direct_unchecked =
2056 cuda_block_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
2057 using cuda_block_zxy_direct_unchecked =
2058 cuda_block_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
2059 using cuda_block_zyx_direct_unchecked =
2060 cuda_block_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
// cuda_global_direct_unchecked<dims...>: cuda_indexer_direct_unchecked over
// one cuda::IndexGlobal<dim, ...> per listed dimension.
// NOTE(review): the `template<...>` line (embedded line 2062) and the
// remaining IndexGlobal arguments with the closing `>...>;` (embedded lines
// 2065-2066) are absent from this listing; restore them from the full header.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2063 using cuda_global_direct_unchecked = cuda_indexer_direct_unchecked<
2064 cuda::IndexGlobal<dims,
2067 using cuda_global_x_direct_unchecked =
2068 cuda_global_direct_unchecked<named_dim::x>;
2069 using cuda_global_y_direct_unchecked =
2070 cuda_global_direct_unchecked<named_dim::y>;
2071 using cuda_global_z_direct_unchecked =
2072 cuda_global_direct_unchecked<named_dim::z>;
2073 using cuda_global_xy_direct_unchecked =
2074 cuda_global_direct_unchecked<named_dim::x, named_dim::y>;
2075 using cuda_global_xz_direct_unchecked =
2076 cuda_global_direct_unchecked<named_dim::x, named_dim::z>;
2077 using cuda_global_yx_direct_unchecked =
2078 cuda_global_direct_unchecked<named_dim::y, named_dim::x>;
2079 using cuda_global_yz_direct_unchecked =
2080 cuda_global_direct_unchecked<named_dim::y, named_dim::z>;
2081 using cuda_global_zx_direct_unchecked =
2082 cuda_global_direct_unchecked<named_dim::z, named_dim::x>;
2083 using cuda_global_zy_direct_unchecked =
2084 cuda_global_direct_unchecked<named_dim::z, named_dim::y>;
2085 using cuda_global_xyz_direct_unchecked =
2086 cuda_global_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
2087 using cuda_global_xzy_direct_unchecked =
2088 cuda_global_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
2089 using cuda_global_yxz_direct_unchecked =
2090 cuda_global_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
2091 using cuda_global_yzx_direct_unchecked =
2092 cuda_global_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
2093 using cuda_global_zxy_direct_unchecked =
2094 cuda_global_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
2095 using cuda_global_zyx_direct_unchecked =
2096 cuda_global_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
// cuda_thread_direct<dims...>: cuda_indexer_direct over one
// cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored> per
// listed dimension.
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 2105) is absent from this listing.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2106 using cuda_thread_direct = cuda_indexer_direct<
2107 cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2108 using cuda_thread_x_direct = cuda_thread_direct<named_dim::x>;
2109 using cuda_thread_y_direct = cuda_thread_direct<named_dim::y>;
2110 using cuda_thread_z_direct = cuda_thread_direct<named_dim::z>;
2111 using cuda_thread_xy_direct = cuda_thread_direct<named_dim::x, named_dim::y>;
2112 using cuda_thread_xz_direct = cuda_thread_direct<named_dim::x, named_dim::z>;
2113 using cuda_thread_yx_direct = cuda_thread_direct<named_dim::y, named_dim::x>;
2114 using cuda_thread_yz_direct = cuda_thread_direct<named_dim::y, named_dim::z>;
2115 using cuda_thread_zx_direct = cuda_thread_direct<named_dim::z, named_dim::x>;
2116 using cuda_thread_zy_direct = cuda_thread_direct<named_dim::z, named_dim::y>;
2117 using cuda_thread_xyz_direct =
2118 cuda_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
2119 using cuda_thread_xzy_direct =
2120 cuda_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
2121 using cuda_thread_yxz_direct =
2122 cuda_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
2123 using cuda_thread_yzx_direct =
2124 cuda_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
2125 using cuda_thread_zxy_direct =
2126 cuda_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
2127 using cuda_thread_zyx_direct =
2128 cuda_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
// cuda_block_direct<dims...>: cuda_indexer_direct over one
// cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified> per
// listed dimension.
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 2130) is absent from this listing.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2131 using cuda_block_direct = cuda_indexer_direct<
2132 cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2133 using cuda_block_x_direct = cuda_block_direct<named_dim::x>;
2134 using cuda_block_y_direct = cuda_block_direct<named_dim::y>;
2135 using cuda_block_z_direct = cuda_block_direct<named_dim::z>;
2136 using cuda_block_xy_direct = cuda_block_direct<named_dim::x, named_dim::y>;
2137 using cuda_block_xz_direct = cuda_block_direct<named_dim::x, named_dim::z>;
2138 using cuda_block_yx_direct = cuda_block_direct<named_dim::y, named_dim::x>;
2139 using cuda_block_yz_direct = cuda_block_direct<named_dim::y, named_dim::z>;
2140 using cuda_block_zx_direct = cuda_block_direct<named_dim::z, named_dim::x>;
2141 using cuda_block_zy_direct = cuda_block_direct<named_dim::z, named_dim::y>;
2142 using cuda_block_xyz_direct =
2143 cuda_block_direct<named_dim::x, named_dim::y, named_dim::z>;
2144 using cuda_block_xzy_direct =
2145 cuda_block_direct<named_dim::x, named_dim::z, named_dim::y>;
2146 using cuda_block_yxz_direct =
2147 cuda_block_direct<named_dim::y, named_dim::x, named_dim::z>;
2148 using cuda_block_yzx_direct =
2149 cuda_block_direct<named_dim::y, named_dim::z, named_dim::x>;
2150 using cuda_block_zxy_direct =
2151 cuda_block_direct<named_dim::z, named_dim::x, named_dim::y>;
2152 using cuda_block_zyx_direct =
2153 cuda_block_direct<named_dim::z, named_dim::y, named_dim::x>;
// cuda_global_direct<dims...>: cuda_indexer_direct over one
// cuda::IndexGlobal<dim, ...> per listed dimension.
// NOTE(review): the `template<...>` line (embedded line 2155) and the
// remaining IndexGlobal arguments with the closing `>...>;` (embedded lines
// 2158-2159) are absent from this listing; restore them from the full header.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2156 using cuda_global_direct =
2157 cuda_indexer_direct<cuda::IndexGlobal<dims,
2160 using cuda_global_x_direct = cuda_global_direct<named_dim::x>;
2161 using cuda_global_y_direct = cuda_global_direct<named_dim::y>;
2162 using cuda_global_z_direct = cuda_global_direct<named_dim::z>;
2163 using cuda_global_xy_direct = cuda_global_direct<named_dim::x, named_dim::y>;
2164 using cuda_global_xz_direct = cuda_global_direct<named_dim::x, named_dim::z>;
2165 using cuda_global_yx_direct = cuda_global_direct<named_dim::y, named_dim::x>;
2166 using cuda_global_yz_direct = cuda_global_direct<named_dim::y, named_dim::z>;
2167 using cuda_global_zx_direct = cuda_global_direct<named_dim::z, named_dim::x>;
2168 using cuda_global_zy_direct = cuda_global_direct<named_dim::z, named_dim::y>;
2169 using cuda_global_xyz_direct =
2170 cuda_global_direct<named_dim::x, named_dim::y, named_dim::z>;
2171 using cuda_global_xzy_direct =
2172 cuda_global_direct<named_dim::x, named_dim::z, named_dim::y>;
2173 using cuda_global_yxz_direct =
2174 cuda_global_direct<named_dim::y, named_dim::x, named_dim::z>;
2175 using cuda_global_yzx_direct =
2176 cuda_global_direct<named_dim::y, named_dim::z, named_dim::x>;
2177 using cuda_global_zxy_direct =
2178 cuda_global_direct<named_dim::z, named_dim::x, named_dim::y>;
2179 using cuda_global_zyx_direct =
2180 cuda_global_direct<named_dim::z, named_dim::y, named_dim::x>;
// cuda_thread_loop<dims...>: cuda_indexer_loop over one
// cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored> per
// listed dimension.
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 2187) is absent from this listing.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2188 using cuda_thread_loop = cuda_indexer_loop<
2189 cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2190 using cuda_thread_x_loop = cuda_thread_loop<named_dim::x>;
2191 using cuda_thread_y_loop = cuda_thread_loop<named_dim::y>;
2192 using cuda_thread_z_loop = cuda_thread_loop<named_dim::z>;
2193 using cuda_thread_xy_loop = cuda_thread_loop<named_dim::x, named_dim::y>;
2194 using cuda_thread_xz_loop = cuda_thread_loop<named_dim::x, named_dim::z>;
2195 using cuda_thread_yx_loop = cuda_thread_loop<named_dim::y, named_dim::x>;
2196 using cuda_thread_yz_loop = cuda_thread_loop<named_dim::y, named_dim::z>;
2197 using cuda_thread_zx_loop = cuda_thread_loop<named_dim::z, named_dim::x>;
2198 using cuda_thread_zy_loop = cuda_thread_loop<named_dim::z, named_dim::y>;
2199 using cuda_thread_xyz_loop =
2200 cuda_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
2201 using cuda_thread_xzy_loop =
2202 cuda_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
2203 using cuda_thread_yxz_loop =
2204 cuda_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
2205 using cuda_thread_yzx_loop =
2206 cuda_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
2207 using cuda_thread_zxy_loop =
2208 cuda_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
2209 using cuda_thread_zyx_loop =
2210 cuda_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
// cuda_block_loop<dims...>: cuda_indexer_loop over one
// cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified> per
// listed dimension.
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 2212) is absent from this listing.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2213 using cuda_block_loop = cuda_indexer_loop<
2214 cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2215 using cuda_block_x_loop = cuda_block_loop<named_dim::x>;
2216 using cuda_block_y_loop = cuda_block_loop<named_dim::y>;
2217 using cuda_block_z_loop = cuda_block_loop<named_dim::z>;
2218 using cuda_block_xy_loop = cuda_block_loop<named_dim::x, named_dim::y>;
2219 using cuda_block_xz_loop = cuda_block_loop<named_dim::x, named_dim::z>;
2220 using cuda_block_yx_loop = cuda_block_loop<named_dim::y, named_dim::x>;
2221 using cuda_block_yz_loop = cuda_block_loop<named_dim::y, named_dim::z>;
2222 using cuda_block_zx_loop = cuda_block_loop<named_dim::z, named_dim::x>;
2223 using cuda_block_zy_loop = cuda_block_loop<named_dim::z, named_dim::y>;
2224 using cuda_block_xyz_loop =
2225 cuda_block_loop<named_dim::x, named_dim::y, named_dim::z>;
2226 using cuda_block_xzy_loop =
2227 cuda_block_loop<named_dim::x, named_dim::z, named_dim::y>;
2228 using cuda_block_yxz_loop =
2229 cuda_block_loop<named_dim::y, named_dim::x, named_dim::z>;
2230 using cuda_block_yzx_loop =
2231 cuda_block_loop<named_dim::y, named_dim::z, named_dim::x>;
2232 using cuda_block_zxy_loop =
2233 cuda_block_loop<named_dim::z, named_dim::x, named_dim::y>;
2234 using cuda_block_zyx_loop =
2235 cuda_block_loop<named_dim::z, named_dim::y, named_dim::x>;
// cuda_global_loop<dims...>: cuda_indexer_loop over one
// cuda::IndexGlobal<dim, ...> per listed dimension.
// NOTE(review): the `template<...>` line (embedded line 2237) and the
// remaining IndexGlobal arguments with the closing `>...>;` (embedded lines
// 2240-2241) are absent from this listing; restore them from the full header.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2238 using cuda_global_loop =
2239 cuda_indexer_loop<cuda::IndexGlobal<dims,
2242 using cuda_global_x_loop = cuda_global_loop<named_dim::x>;
2243 using cuda_global_y_loop = cuda_global_loop<named_dim::y>;
2244 using cuda_global_z_loop = cuda_global_loop<named_dim::z>;
2245 using cuda_global_xy_loop = cuda_global_loop<named_dim::x, named_dim::y>;
2246 using cuda_global_xz_loop = cuda_global_loop<named_dim::x, named_dim::z>;
2247 using cuda_global_yx_loop = cuda_global_loop<named_dim::y, named_dim::x>;
2248 using cuda_global_yz_loop = cuda_global_loop<named_dim::y, named_dim::z>;
2249 using cuda_global_zx_loop = cuda_global_loop<named_dim::z, named_dim::x>;
2250 using cuda_global_zy_loop = cuda_global_loop<named_dim::z, named_dim::y>;
2251 using cuda_global_xyz_loop =
2252 cuda_global_loop<named_dim::x, named_dim::y, named_dim::z>;
2253 using cuda_global_xzy_loop =
2254 cuda_global_loop<named_dim::x, named_dim::z, named_dim::y>;
2255 using cuda_global_yxz_loop =
2256 cuda_global_loop<named_dim::y, named_dim::x, named_dim::z>;
2257 using cuda_global_yzx_loop =
2258 cuda_global_loop<named_dim::y, named_dim::z, named_dim::x>;
2259 using cuda_global_zxy_loop =
2260 cuda_global_loop<named_dim::z, named_dim::x, named_dim::y>;
2261 using cuda_global_zyx_loop =
2262 cuda_global_loop<named_dim::z, named_dim::y, named_dim::x>;
// cuda_thread_syncable_loop<dims...>: cuda_indexer_syncable_loop over one
// cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored> per
// listed dimension.
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 2271) is absent from this listing.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2272 using cuda_thread_syncable_loop = cuda_indexer_syncable_loop<
2273 cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2274 using cuda_thread_x_syncable_loop = cuda_thread_syncable_loop<named_dim::x>;
2275 using cuda_thread_y_syncable_loop = cuda_thread_syncable_loop<named_dim::y>;
2276 using cuda_thread_z_syncable_loop = cuda_thread_syncable_loop<named_dim::z>;
2277 using cuda_thread_xy_syncable_loop =
2278 cuda_thread_syncable_loop<named_dim::x, named_dim::y>;
2279 using cuda_thread_xz_syncable_loop =
2280 cuda_thread_syncable_loop<named_dim::x, named_dim::z>;
2281 using cuda_thread_yx_syncable_loop =
2282 cuda_thread_syncable_loop<named_dim::y, named_dim::x>;
2283 using cuda_thread_yz_syncable_loop =
2284 cuda_thread_syncable_loop<named_dim::y, named_dim::z>;
2285 using cuda_thread_zx_syncable_loop =
2286 cuda_thread_syncable_loop<named_dim::z, named_dim::x>;
2287 using cuda_thread_zy_syncable_loop =
2288 cuda_thread_syncable_loop<named_dim::z, named_dim::y>;
2289 using cuda_thread_xyz_syncable_loop =
2290 cuda_thread_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2291 using cuda_thread_xzy_syncable_loop =
2292 cuda_thread_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2293 using cuda_thread_yxz_syncable_loop =
2294 cuda_thread_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2295 using cuda_thread_yzx_syncable_loop =
2296 cuda_thread_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2297 using cuda_thread_zxy_syncable_loop =
2298 cuda_thread_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2299 using cuda_thread_zyx_syncable_loop =
2300 cuda_thread_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
// cuda_block_syncable_loop<dims...>: cuda_indexer_syncable_loop over one
// cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified> per
// listed dimension.
// NOTE(review): the `template<...>` line declaring the `dims` pack (embedded
// line 2302) is absent from this listing.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2303 using cuda_block_syncable_loop = cuda_indexer_syncable_loop<
2304 cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2305 using cuda_block_x_syncable_loop = cuda_block_syncable_loop<named_dim::x>;
2306 using cuda_block_y_syncable_loop = cuda_block_syncable_loop<named_dim::y>;
2307 using cuda_block_z_syncable_loop = cuda_block_syncable_loop<named_dim::z>;
2308 using cuda_block_xy_syncable_loop =
2309 cuda_block_syncable_loop<named_dim::x, named_dim::y>;
2310 using cuda_block_xz_syncable_loop =
2311 cuda_block_syncable_loop<named_dim::x, named_dim::z>;
2312 using cuda_block_yx_syncable_loop =
2313 cuda_block_syncable_loop<named_dim::y, named_dim::x>;
2314 using cuda_block_yz_syncable_loop =
2315 cuda_block_syncable_loop<named_dim::y, named_dim::z>;
2316 using cuda_block_zx_syncable_loop =
2317 cuda_block_syncable_loop<named_dim::z, named_dim::x>;
2318 using cuda_block_zy_syncable_loop =
2319 cuda_block_syncable_loop<named_dim::z, named_dim::y>;
2320 using cuda_block_xyz_syncable_loop =
2321 cuda_block_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2322 using cuda_block_xzy_syncable_loop =
2323 cuda_block_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2324 using cuda_block_yxz_syncable_loop =
2325 cuda_block_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2326 using cuda_block_yzx_syncable_loop =
2327 cuda_block_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2328 using cuda_block_zxy_syncable_loop =
2329 cuda_block_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2330 using cuda_block_zyx_syncable_loop =
2331 cuda_block_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
// cuda_global_syncable_loop<dims...>: cuda_indexer_syncable_loop over one
// cuda::IndexGlobal<dim, ...> per listed dimension.
// NOTE(review): the `template<...>` line (embedded line 2333) and the
// remaining IndexGlobal arguments with the closing `>...>;` (embedded lines
// 2336-2337) are absent from this listing; restore them from the full header.
//
// The named aliases cover every ordering of one, two or three of x/y/z; the
// letters in the alias name give the order of the named_dim arguments.
2334 using cuda_global_syncable_loop =
2335 cuda_indexer_syncable_loop<cuda::IndexGlobal<dims,
2338 using cuda_global_x_syncable_loop = cuda_global_syncable_loop<named_dim::x>;
2339 using cuda_global_y_syncable_loop = cuda_global_syncable_loop<named_dim::y>;
2340 using cuda_global_z_syncable_loop = cuda_global_syncable_loop<named_dim::z>;
2341 using cuda_global_xy_syncable_loop =
2342 cuda_global_syncable_loop<named_dim::x, named_dim::y>;
2343 using cuda_global_xz_syncable_loop =
2344 cuda_global_syncable_loop<named_dim::x, named_dim::z>;
2345 using cuda_global_yx_syncable_loop =
2346 cuda_global_syncable_loop<named_dim::y, named_dim::x>;
2347 using cuda_global_yz_syncable_loop =
2348 cuda_global_syncable_loop<named_dim::y, named_dim::z>;
2349 using cuda_global_zx_syncable_loop =
2350 cuda_global_syncable_loop<named_dim::z, named_dim::x>;
2351 using cuda_global_zy_syncable_loop =
2352 cuda_global_syncable_loop<named_dim::z, named_dim::y>;
2353 using cuda_global_xyz_syncable_loop =
2354 cuda_global_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2355 using cuda_global_xzy_syncable_loop =
2356 cuda_global_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2357 using cuda_global_yxz_syncable_loop =
2358 cuda_global_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2359 using cuda_global_yzx_syncable_loop =
2360 cuda_global_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2361 using cuda_global_zxy_syncable_loop =
2362 cuda_global_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2363 using cuda_global_zyx_syncable_loop =
2364 cuda_global_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
2374 using cuda_flatten_thread_direct_unchecked =
2375 cuda_flatten_indexer_direct_unchecked<
2376 cuda::IndexGlobal<dims,
2379 using cuda_flatten_thread_x_direct_unchecked =
2380 cuda_flatten_thread_direct_unchecked<named_dim::x>;
2381 using cuda_flatten_thread_y_direct_unchecked =
2382 cuda_flatten_thread_direct_unchecked<named_dim::y>;
2383 using cuda_flatten_thread_z_direct_unchecked =
2384 cuda_flatten_thread_direct_unchecked<named_dim::z>;
2385 using cuda_flatten_thread_xy_direct_unchecked =
2386 cuda_flatten_thread_direct_unchecked<named_dim::x, named_dim::y>;
2387 using cuda_flatten_thread_xz_direct_unchecked =
2388 cuda_flatten_thread_direct_unchecked<named_dim::x, named_dim::z>;
2389 using cuda_flatten_thread_yx_direct_unchecked =
2390 cuda_flatten_thread_direct_unchecked<named_dim::y, named_dim::x>;
2391 using cuda_flatten_thread_yz_direct_unchecked =
2392 cuda_flatten_thread_direct_unchecked<named_dim::y, named_dim::z>;
2393 using cuda_flatten_thread_zx_direct_unchecked =
2394 cuda_flatten_thread_direct_unchecked<named_dim::z, named_dim::x>;
2395 using cuda_flatten_thread_zy_direct_unchecked =
2396 cuda_flatten_thread_direct_unchecked<named_dim::z, named_dim::y>;
// Three-dimension cuda_flatten_thread_*_direct_unchecked aliases.
// NOTE(review): none of these six aliases shows a right-hand side in this
// listing. By analogy with the one- and two-dimension aliases, each presumably
// instantiates cuda_flatten_thread_direct_unchecked with the named_dim values
// in the order spelled by its name - confirm against the full header.
2397 using cuda_flatten_thread_xyz_direct_unchecked =
2401 using cuda_flatten_thread_xzy_direct_unchecked =
2405 using cuda_flatten_thread_yxz_direct_unchecked =
2409 using cuda_flatten_thread_yzx_direct_unchecked =
2413 using cuda_flatten_thread_zxy_direct_unchecked =
2417 using cuda_flatten_thread_zyx_direct_unchecked =
// Alias that hands cuda::IndexGlobal<dims, ...> to
// cuda_flatten_indexer_direct_unchecked.
// NOTE(review): the declaration of `dims`, the remaining IndexGlobal arguments
// and the closing of the alias are not visible in this listing - presumably
// on lines missing here; confirm against the full header.
2423 using cuda_flatten_block_direct_unchecked =
2424 cuda_flatten_indexer_direct_unchecked<
2425 cuda::IndexGlobal<dims,
2428 using cuda_flatten_block_x_direct_unchecked =
2429 cuda_flatten_block_direct_unchecked<named_dim::x>;
2430 using cuda_flatten_block_y_direct_unchecked =
2431 cuda_flatten_block_direct_unchecked<named_dim::y>;
2432 using cuda_flatten_block_z_direct_unchecked =
2433 cuda_flatten_block_direct_unchecked<named_dim::z>;
2434 using cuda_flatten_block_xy_direct_unchecked =
2435 cuda_flatten_block_direct_unchecked<named_dim::x, named_dim::y>;
2436 using cuda_flatten_block_xz_direct_unchecked =
2437 cuda_flatten_block_direct_unchecked<named_dim::x, named_dim::z>;
2438 using cuda_flatten_block_yx_direct_unchecked =
2439 cuda_flatten_block_direct_unchecked<named_dim::y, named_dim::x>;
2440 using cuda_flatten_block_yz_direct_unchecked =
2441 cuda_flatten_block_direct_unchecked<named_dim::y, named_dim::z>;
2442 using cuda_flatten_block_zx_direct_unchecked =
2443 cuda_flatten_block_direct_unchecked<named_dim::z, named_dim::x>;
2444 using cuda_flatten_block_zy_direct_unchecked =
2445 cuda_flatten_block_direct_unchecked<named_dim::z, named_dim::y>;
// Three-dimension cuda_flatten_block_*_direct_unchecked aliases.
// NOTE(review): none of these six aliases shows a right-hand side in this
// listing. By analogy with the one- and two-dimension aliases, each presumably
// instantiates cuda_flatten_block_direct_unchecked with the named_dim values
// in the order spelled by its name - confirm against the full header.
2446 using cuda_flatten_block_xyz_direct_unchecked =
2450 using cuda_flatten_block_xzy_direct_unchecked =
2454 using cuda_flatten_block_yxz_direct_unchecked =
2458 using cuda_flatten_block_yzx_direct_unchecked =
2462 using cuda_flatten_block_zxy_direct_unchecked =
2466 using cuda_flatten_block_zyx_direct_unchecked =
// Alias that hands cuda::IndexGlobal<dims, ...> to
// cuda_flatten_indexer_direct_unchecked.
// NOTE(review): the declaration of `dims`, the remaining IndexGlobal arguments
// and the closing of the alias are not visible in this listing - presumably
// on lines missing here; confirm against the full header.
2472 using cuda_flatten_global_direct_unchecked =
2473 cuda_flatten_indexer_direct_unchecked<
2474 cuda::IndexGlobal<dims,
2477 using cuda_flatten_global_x_direct_unchecked =
2478 cuda_flatten_global_direct_unchecked<named_dim::x>;
2479 using cuda_flatten_global_y_direct_unchecked =
2480 cuda_flatten_global_direct_unchecked<named_dim::y>;
2481 using cuda_flatten_global_z_direct_unchecked =
2482 cuda_flatten_global_direct_unchecked<named_dim::z>;
2483 using cuda_flatten_global_xy_direct_unchecked =
2484 cuda_flatten_global_direct_unchecked<named_dim::x, named_dim::y>;
2485 using cuda_flatten_global_xz_direct_unchecked =
2486 cuda_flatten_global_direct_unchecked<named_dim::x, named_dim::z>;
2487 using cuda_flatten_global_yx_direct_unchecked =
2488 cuda_flatten_global_direct_unchecked<named_dim::y, named_dim::x>;
2489 using cuda_flatten_global_yz_direct_unchecked =
2490 cuda_flatten_global_direct_unchecked<named_dim::y, named_dim::z>;
2491 using cuda_flatten_global_zx_direct_unchecked =
2492 cuda_flatten_global_direct_unchecked<named_dim::z, named_dim::x>;
2493 using cuda_flatten_global_zy_direct_unchecked =
2494 cuda_flatten_global_direct_unchecked<named_dim::z, named_dim::y>;
// Three-dimension cuda_flatten_global_*_direct_unchecked aliases.
// NOTE(review): none of these six aliases shows a right-hand side in this
// listing. By analogy with the one- and two-dimension aliases, each presumably
// instantiates cuda_flatten_global_direct_unchecked with the named_dim values
// in the order spelled by its name - confirm against the full header.
2495 using cuda_flatten_global_xyz_direct_unchecked =
2499 using cuda_flatten_global_xzy_direct_unchecked =
2503 using cuda_flatten_global_yxz_direct_unchecked =
2507 using cuda_flatten_global_yzx_direct_unchecked =
2511 using cuda_flatten_global_zxy_direct_unchecked =
2515 using cuda_flatten_global_zyx_direct_unchecked =
// Variadic alias over the pack `dims`: for each entry it passes
// cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored> to
// cuda_flatten_indexer_direct.
// NOTE(review): the `template<...>` header declaring `dims` is not among the
// visible lines - presumably `template<named_dim... dims>`; confirm.
2529 using cuda_flatten_thread_direct = cuda_flatten_indexer_direct<
2530 cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2531 using cuda_flatten_thread_x_direct = cuda_flatten_thread_direct<named_dim::x>;
2532 using cuda_flatten_thread_y_direct = cuda_flatten_thread_direct<named_dim::y>;
2533 using cuda_flatten_thread_z_direct = cuda_flatten_thread_direct<named_dim::z>;
2534 using cuda_flatten_thread_xy_direct =
2535 cuda_flatten_thread_direct<named_dim::x, named_dim::y>;
2536 using cuda_flatten_thread_xz_direct =
2537 cuda_flatten_thread_direct<named_dim::x, named_dim::z>;
2538 using cuda_flatten_thread_yx_direct =
2539 cuda_flatten_thread_direct<named_dim::y, named_dim::x>;
2540 using cuda_flatten_thread_yz_direct =
2541 cuda_flatten_thread_direct<named_dim::y, named_dim::z>;
2542 using cuda_flatten_thread_zx_direct =
2543 cuda_flatten_thread_direct<named_dim::z, named_dim::x>;
2544 using cuda_flatten_thread_zy_direct =
2545 cuda_flatten_thread_direct<named_dim::z, named_dim::y>;
2546 using cuda_flatten_thread_xyz_direct =
2547 cuda_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
2548 using cuda_flatten_thread_xzy_direct =
2549 cuda_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
2550 using cuda_flatten_thread_yxz_direct =
2551 cuda_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
2552 using cuda_flatten_thread_yzx_direct =
2553 cuda_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
2554 using cuda_flatten_thread_zxy_direct =
2555 cuda_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
2556 using cuda_flatten_thread_zyx_direct =
2557 cuda_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
// Variadic alias over the pack `dims`: for each entry it passes
// cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified> to
// cuda_flatten_indexer_direct.
// NOTE(review): the `template<...>` header declaring `dims` is not among the
// visible lines - presumably `template<named_dim... dims>`; confirm.
2560 using cuda_flatten_block_direct = cuda_flatten_indexer_direct<
2561 cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2562 using cuda_flatten_block_x_direct = cuda_flatten_block_direct<named_dim::x>;
2563 using cuda_flatten_block_y_direct = cuda_flatten_block_direct<named_dim::y>;
2564 using cuda_flatten_block_z_direct = cuda_flatten_block_direct<named_dim::z>;
2565 using cuda_flatten_block_xy_direct =
2566 cuda_flatten_block_direct<named_dim::x, named_dim::y>;
2567 using cuda_flatten_block_xz_direct =
2568 cuda_flatten_block_direct<named_dim::x, named_dim::z>;
2569 using cuda_flatten_block_yx_direct =
2570 cuda_flatten_block_direct<named_dim::y, named_dim::x>;
2571 using cuda_flatten_block_yz_direct =
2572 cuda_flatten_block_direct<named_dim::y, named_dim::z>;
2573 using cuda_flatten_block_zx_direct =
2574 cuda_flatten_block_direct<named_dim::z, named_dim::x>;
2575 using cuda_flatten_block_zy_direct =
2576 cuda_flatten_block_direct<named_dim::z, named_dim::y>;
2577 using cuda_flatten_block_xyz_direct =
2578 cuda_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
2579 using cuda_flatten_block_xzy_direct =
2580 cuda_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
2581 using cuda_flatten_block_yxz_direct =
2582 cuda_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
2583 using cuda_flatten_block_yzx_direct =
2584 cuda_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
2585 using cuda_flatten_block_zxy_direct =
2586 cuda_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
2587 using cuda_flatten_block_zyx_direct =
2588 cuda_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
// Alias that hands cuda::IndexGlobal<dims, ...> to cuda_flatten_indexer_direct.
// NOTE(review): the declaration of `dims`, the remaining IndexGlobal arguments
// and the closing of the alias are not visible in this listing - presumably
// on lines missing here; confirm against the full header.
2591 using cuda_flatten_global_direct =
2592 cuda_flatten_indexer_direct<cuda::IndexGlobal<dims,
2595 using cuda_flatten_global_x_direct = cuda_flatten_global_direct<named_dim::x>;
2596 using cuda_flatten_global_y_direct = cuda_flatten_global_direct<named_dim::y>;
2597 using cuda_flatten_global_z_direct = cuda_flatten_global_direct<named_dim::z>;
2598 using cuda_flatten_global_xy_direct =
2599 cuda_flatten_global_direct<named_dim::x, named_dim::y>;
2600 using cuda_flatten_global_xz_direct =
2601 cuda_flatten_global_direct<named_dim::x, named_dim::z>;
2602 using cuda_flatten_global_yx_direct =
2603 cuda_flatten_global_direct<named_dim::y, named_dim::x>;
2604 using cuda_flatten_global_yz_direct =
2605 cuda_flatten_global_direct<named_dim::y, named_dim::z>;
2606 using cuda_flatten_global_zx_direct =
2607 cuda_flatten_global_direct<named_dim::z, named_dim::x>;
2608 using cuda_flatten_global_zy_direct =
2609 cuda_flatten_global_direct<named_dim::z, named_dim::y>;
2610 using cuda_flatten_global_xyz_direct =
2611 cuda_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
2612 using cuda_flatten_global_xzy_direct =
2613 cuda_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
2614 using cuda_flatten_global_yxz_direct =
2615 cuda_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
2616 using cuda_flatten_global_yzx_direct =
2617 cuda_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
2618 using cuda_flatten_global_zxy_direct =
2619 cuda_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
2620 using cuda_flatten_global_zyx_direct =
2621 cuda_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
// Variadic alias over the pack `dims`: for each entry it passes
// cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored> to
// cuda_flatten_indexer_loop.
// NOTE(review): the `template<...>` header declaring `dims` is not among the
// visible lines - presumably `template<named_dim... dims>`; confirm.
2631 using cuda_flatten_thread_loop = cuda_flatten_indexer_loop<
2632 cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
2633 using cuda_flatten_thread_x_loop = cuda_flatten_thread_loop<named_dim::x>;
2634 using cuda_flatten_thread_y_loop = cuda_flatten_thread_loop<named_dim::y>;
2635 using cuda_flatten_thread_z_loop = cuda_flatten_thread_loop<named_dim::z>;
2636 using cuda_flatten_thread_xy_loop =
2637 cuda_flatten_thread_loop<named_dim::x, named_dim::y>;
2638 using cuda_flatten_thread_xz_loop =
2639 cuda_flatten_thread_loop<named_dim::x, named_dim::z>;
2640 using cuda_flatten_thread_yx_loop =
2641 cuda_flatten_thread_loop<named_dim::y, named_dim::x>;
2642 using cuda_flatten_thread_yz_loop =
2643 cuda_flatten_thread_loop<named_dim::y, named_dim::z>;
2644 using cuda_flatten_thread_zx_loop =
2645 cuda_flatten_thread_loop<named_dim::z, named_dim::x>;
2646 using cuda_flatten_thread_zy_loop =
2647 cuda_flatten_thread_loop<named_dim::z, named_dim::y>;
2648 using cuda_flatten_thread_xyz_loop =
2649 cuda_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
2650 using cuda_flatten_thread_xzy_loop =
2651 cuda_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
2652 using cuda_flatten_thread_yxz_loop =
2653 cuda_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
2654 using cuda_flatten_thread_yzx_loop =
2655 cuda_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
2656 using cuda_flatten_thread_zxy_loop =
2657 cuda_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
2658 using cuda_flatten_thread_zyx_loop =
2659 cuda_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
// Variadic alias over the pack `dims`: for each entry it passes
// cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified> to
// cuda_flatten_indexer_loop.
// NOTE(review): the `template<...>` header declaring `dims` is not among the
// visible lines - presumably `template<named_dim... dims>`; confirm.
2662 using cuda_flatten_block_loop = cuda_flatten_indexer_loop<
2663 cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
2664 using cuda_flatten_block_x_loop = cuda_flatten_block_loop<named_dim::x>;
2665 using cuda_flatten_block_y_loop = cuda_flatten_block_loop<named_dim::y>;
2666 using cuda_flatten_block_z_loop = cuda_flatten_block_loop<named_dim::z>;
2667 using cuda_flatten_block_xy_loop =
2668 cuda_flatten_block_loop<named_dim::x, named_dim::y>;
2669 using cuda_flatten_block_xz_loop =
2670 cuda_flatten_block_loop<named_dim::x, named_dim::z>;
2671 using cuda_flatten_block_yx_loop =
2672 cuda_flatten_block_loop<named_dim::y, named_dim::x>;
2673 using cuda_flatten_block_yz_loop =
2674 cuda_flatten_block_loop<named_dim::y, named_dim::z>;
2675 using cuda_flatten_block_zx_loop =
2676 cuda_flatten_block_loop<named_dim::z, named_dim::x>;
2677 using cuda_flatten_block_zy_loop =
2678 cuda_flatten_block_loop<named_dim::z, named_dim::y>;
2679 using cuda_flatten_block_xyz_loop =
2680 cuda_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
2681 using cuda_flatten_block_xzy_loop =
2682 cuda_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
2683 using cuda_flatten_block_yxz_loop =
2684 cuda_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
2685 using cuda_flatten_block_yzx_loop =
2686 cuda_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
2687 using cuda_flatten_block_zxy_loop =
2688 cuda_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
2689 using cuda_flatten_block_zyx_loop =
2690 cuda_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
// Alias that hands cuda::IndexGlobal<dims, ...> to cuda_flatten_indexer_loop.
// NOTE(review): the declaration of `dims`, the remaining IndexGlobal arguments
// and the closing of the alias are not visible in this listing - presumably
// on lines missing here; confirm against the full header.
2693 using cuda_flatten_global_loop =
2694 cuda_flatten_indexer_loop<cuda::IndexGlobal<dims,
2697 using cuda_flatten_global_x_loop = cuda_flatten_global_loop<named_dim::x>;
2698 using cuda_flatten_global_y_loop = cuda_flatten_global_loop<named_dim::y>;
2699 using cuda_flatten_global_z_loop = cuda_flatten_global_loop<named_dim::z>;
2700 using cuda_flatten_global_xy_loop =
2701 cuda_flatten_global_loop<named_dim::x, named_dim::y>;
2702 using cuda_flatten_global_xz_loop =
2703 cuda_flatten_global_loop<named_dim::x, named_dim::z>;
2704 using cuda_flatten_global_yx_loop =
2705 cuda_flatten_global_loop<named_dim::y, named_dim::x>;
2706 using cuda_flatten_global_yz_loop =
2707 cuda_flatten_global_loop<named_dim::y, named_dim::z>;
2708 using cuda_flatten_global_zx_loop =
2709 cuda_flatten_global_loop<named_dim::z, named_dim::x>;
2710 using cuda_flatten_global_zy_loop =
2711 cuda_flatten_global_loop<named_dim::z, named_dim::y>;
2712 using cuda_flatten_global_xyz_loop =
2713 cuda_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
2714 using cuda_flatten_global_xzy_loop =
2715 cuda_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
2716 using cuda_flatten_global_yxz_loop =
2717 cuda_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
2718 using cuda_flatten_global_yzx_loop =
2719 cuda_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
2720 using cuda_flatten_global_zxy_loop =
2721 cuda_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
2722 using cuda_flatten_global_zyx_loop =
2723 cuda_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
2730 template<
int X_SIZE>
2731 using cuda_thread_size_x_direct_unchecked =
2732 cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>>;
2733 template<
int Y_SIZE>
2734 using cuda_thread_size_y_direct_unchecked =
2735 cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>>;
2736 template<
int Z_SIZE>
2737 using cuda_thread_size_z_direct_unchecked =
2738 cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>>;
2739 template<
int X_SIZE,
int Y_SIZE>
2740 using cuda_thread_size_xy_direct_unchecked =
2741 cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2742 cuda::thread_y<Y_SIZE>>;
2743 template<
int X_SIZE,
int Z_SIZE>
2744 using cuda_thread_size_xz_direct_unchecked =
2745 cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2746 cuda::thread_z<Z_SIZE>>;
2747 template<
int Y_SIZE,
int X_SIZE>
2748 using cuda_thread_size_yx_direct_unchecked =
2749 cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2750 cuda::thread_x<X_SIZE>>;
2751 template<
int Y_SIZE,
int Z_SIZE>
2752 using cuda_thread_size_yz_direct_unchecked =
2753 cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2754 cuda::thread_z<Z_SIZE>>;
2755 template<
int Z_SIZE,
int X_SIZE>
2756 using cuda_thread_size_zx_direct_unchecked =
2757 cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2758 cuda::thread_x<X_SIZE>>;
2759 template<
int Z_SIZE,
int Y_SIZE>
2760 using cuda_thread_size_zy_direct_unchecked =
2761 cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2762 cuda::thread_y<Y_SIZE>>;
2763 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
2764 using cuda_thread_size_xyz_direct_unchecked =
2765 cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2766 cuda::thread_y<Y_SIZE>,
2767 cuda::thread_z<Z_SIZE>>;
2768 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
2769 using cuda_thread_size_xzy_direct_unchecked =
2770 cuda_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
2771 cuda::thread_z<Z_SIZE>,
2772 cuda::thread_y<Y_SIZE>>;
2773 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
2774 using cuda_thread_size_yxz_direct_unchecked =
2775 cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2776 cuda::thread_x<X_SIZE>,
2777 cuda::thread_z<Z_SIZE>>;
2778 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
2779 using cuda_thread_size_yzx_direct_unchecked =
2780 cuda_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
2781 cuda::thread_z<Z_SIZE>,
2782 cuda::thread_x<X_SIZE>>;
2783 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
2784 using cuda_thread_size_zxy_direct_unchecked =
2785 cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2786 cuda::thread_x<X_SIZE>,
2787 cuda::thread_y<Y_SIZE>>;
2788 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
2789 using cuda_thread_size_zyx_direct_unchecked =
2790 cuda_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
2791 cuda::thread_y<Y_SIZE>,
2792 cuda::thread_x<X_SIZE>>;
2794 template<
int X_SIZE>
2795 using cuda_block_size_x_direct_unchecked =
2796 cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>>;
2797 template<
int Y_SIZE>
2798 using cuda_block_size_y_direct_unchecked =
2799 cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>>;
2800 template<
int Z_SIZE>
2801 using cuda_block_size_z_direct_unchecked =
2802 cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>>;
2803 template<
int X_SIZE,
int Y_SIZE>
2804 using cuda_block_size_xy_direct_unchecked =
2805 cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
2806 template<
int X_SIZE,
int Z_SIZE>
2807 using cuda_block_size_xz_direct_unchecked =
2808 cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
2809 template<
int Y_SIZE,
int X_SIZE>
2810 using cuda_block_size_yx_direct_unchecked =
2811 cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
2812 template<
int Y_SIZE,
int Z_SIZE>
2813 using cuda_block_size_yz_direct_unchecked =
2814 cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
2815 template<
int Z_SIZE,
int X_SIZE>
2816 using cuda_block_size_zx_direct_unchecked =
2817 cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
2818 template<
int Z_SIZE,
int Y_SIZE>
2819 using cuda_block_size_zy_direct_unchecked =
2820 cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
2821 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
2822 using cuda_block_size_xyz_direct_unchecked =
2823 cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
2824 cuda::block_y<Y_SIZE>,
2825 cuda::block_z<Z_SIZE>>;
2826 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
2827 using cuda_block_size_xzy_direct_unchecked =
2828 cuda_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
2829 cuda::block_z<Z_SIZE>,
2830 cuda::block_y<Y_SIZE>>;
2831 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
2832 using cuda_block_size_yxz_direct_unchecked =
2833 cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
2834 cuda::block_x<X_SIZE>,
2835 cuda::block_z<Z_SIZE>>;
2836 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
2837 using cuda_block_size_yzx_direct_unchecked =
2838 cuda_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
2839 cuda::block_z<Z_SIZE>,
2840 cuda::block_x<X_SIZE>>;
2841 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
2842 using cuda_block_size_zxy_direct_unchecked =
2843 cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
2844 cuda::block_x<X_SIZE>,
2845 cuda::block_y<Y_SIZE>>;
2846 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
2847 using cuda_block_size_zyx_direct_unchecked =
2848 cuda_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
2849 cuda::block_y<Y_SIZE>,
2850 cuda::block_x<X_SIZE>>;
2852 template<
int X_BLOCK_SIZE,
int X_GRID_SIZE = named_usage::unspecified>
2853 using cuda_global_size_x_direct_unchecked =
2854 cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2855 template<
int Y_BLOCK_SIZE,
int Y_GRID_SIZE = named_usage::unspecified>
2856 using cuda_global_size_y_direct_unchecked =
2857 cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2858 template<
int Z_BLOCK_SIZE,
int Z_GRID_SIZE = named_usage::unspecified>
2859 using cuda_global_size_z_direct_unchecked =
2860 cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two- and three-dimension cuda_global_size_*_direct_unchecked aliases. Each
// passes one cuda::global_<dim><BLOCK_SIZE, GRID_SIZE> indexer per letter of
// its name, in that order, to cuda_indexer_direct_unchecked.
// NOTE(review): every template header below shows only its first parameter,
// yet the alias bodies use further *_BLOCK_SIZE / *_GRID_SIZE names. The
// remaining parameters (and any defaults) are presumably on lines missing
// from this listing - confirm their order against the full header.

// Two dimensions.
2861 template<
int X_BLOCK_SIZE,
2865 using cuda_global_size_xy_direct_unchecked =
2866 cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2867 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2868 template<
int X_BLOCK_SIZE,
2872 using cuda_global_size_xz_direct_unchecked =
2873 cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2874 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2875 template<
int Y_BLOCK_SIZE,
2879 using cuda_global_size_yx_direct_unchecked =
2880 cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2881 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2882 template<
int Y_BLOCK_SIZE,
2886 using cuda_global_size_yz_direct_unchecked =
2887 cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2888 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2889 template<
int Z_BLOCK_SIZE,
2893 using cuda_global_size_zx_direct_unchecked =
2894 cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2895 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2896 template<
int Z_BLOCK_SIZE,
2900 using cuda_global_size_zy_direct_unchecked =
2901 cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2902 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;

// Three dimensions.
2903 template<
int X_BLOCK_SIZE,
2909 using cuda_global_size_xyz_direct_unchecked =
2910 cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2911 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2912 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2913 template<
int X_BLOCK_SIZE,
2919 using cuda_global_size_xzy_direct_unchecked =
2920 cuda_indexer_direct_unchecked<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2921 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2922 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2923 template<
int Y_BLOCK_SIZE,
2929 using cuda_global_size_yxz_direct_unchecked =
2930 cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2931 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2932 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2933 template<
int Y_BLOCK_SIZE,
2939 using cuda_global_size_yzx_direct_unchecked =
2940 cuda_indexer_direct_unchecked<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2941 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2942 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2943 template<
int Z_BLOCK_SIZE,
2949 using cuda_global_size_zxy_direct_unchecked =
2950 cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2951 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2952 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2953 template<
int Z_BLOCK_SIZE,
2959 using cuda_global_size_zyx_direct_unchecked =
2960 cuda_indexer_direct_unchecked<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2961 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2962 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2969 template<
int X_SIZE>
2970 using cuda_thread_size_x_direct = cuda_indexer_direct<cuda::thread_x<X_SIZE>>;
2971 template<
int Y_SIZE>
2972 using cuda_thread_size_y_direct = cuda_indexer_direct<cuda::thread_y<Y_SIZE>>;
2973 template<
int Z_SIZE>
2974 using cuda_thread_size_z_direct = cuda_indexer_direct<cuda::thread_z<Z_SIZE>>;
2975 template<
int X_SIZE,
int Y_SIZE>
2976 using cuda_thread_size_xy_direct =
2977 cuda_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
2978 template<
int X_SIZE,
int Z_SIZE>
2979 using cuda_thread_size_xz_direct =
2980 cuda_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
2981 template<
int Y_SIZE,
int X_SIZE>
2982 using cuda_thread_size_yx_direct =
2983 cuda_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
2984 template<
int Y_SIZE,
int Z_SIZE>
2985 using cuda_thread_size_yz_direct =
2986 cuda_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
2987 template<
int Z_SIZE,
int X_SIZE>
2988 using cuda_thread_size_zx_direct =
2989 cuda_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
2990 template<
int Z_SIZE,
int Y_SIZE>
2991 using cuda_thread_size_zy_direct =
2992 cuda_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
2993 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
2994 using cuda_thread_size_xyz_direct = cuda_indexer_direct<cuda::thread_x<X_SIZE>,
2995 cuda::thread_y<Y_SIZE>,
2996 cuda::thread_z<Z_SIZE>>;
2997 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
2998 using cuda_thread_size_xzy_direct = cuda_indexer_direct<cuda::thread_x<X_SIZE>,
2999 cuda::thread_z<Z_SIZE>,
3000 cuda::thread_y<Y_SIZE>>;
3001 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3002 using cuda_thread_size_yxz_direct = cuda_indexer_direct<cuda::thread_y<Y_SIZE>,
3003 cuda::thread_x<X_SIZE>,
3004 cuda::thread_z<Z_SIZE>>;
3005 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3006 using cuda_thread_size_yzx_direct = cuda_indexer_direct<cuda::thread_y<Y_SIZE>,
3007 cuda::thread_z<Z_SIZE>,
3008 cuda::thread_x<X_SIZE>>;
3009 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3010 using cuda_thread_size_zxy_direct = cuda_indexer_direct<cuda::thread_z<Z_SIZE>,
3011 cuda::thread_x<X_SIZE>,
3012 cuda::thread_y<Y_SIZE>>;
3013 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3014 using cuda_thread_size_zyx_direct = cuda_indexer_direct<cuda::thread_z<Z_SIZE>,
3015 cuda::thread_y<Y_SIZE>,
3016 cuda::thread_x<X_SIZE>>;
3018 template<
int X_SIZE>
3019 using cuda_block_size_x_direct = cuda_indexer_direct<cuda::block_x<X_SIZE>>;
3020 template<
int Y_SIZE>
3021 using cuda_block_size_y_direct = cuda_indexer_direct<cuda::block_y<Y_SIZE>>;
3022 template<
int Z_SIZE>
3023 using cuda_block_size_z_direct = cuda_indexer_direct<cuda::block_z<Z_SIZE>>;
3024 template<
int X_SIZE,
int Y_SIZE>
3025 using cuda_block_size_xy_direct =
3026 cuda_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3027 template<
int X_SIZE,
int Z_SIZE>
3028 using cuda_block_size_xz_direct =
3029 cuda_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3030 template<
int Y_SIZE,
int X_SIZE>
3031 using cuda_block_size_yx_direct =
3032 cuda_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3033 template<
int Y_SIZE,
int Z_SIZE>
3034 using cuda_block_size_yz_direct =
3035 cuda_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3036 template<
int Z_SIZE,
int X_SIZE>
3037 using cuda_block_size_zx_direct =
3038 cuda_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3039 template<
int Z_SIZE,
int Y_SIZE>
3040 using cuda_block_size_zy_direct =
3041 cuda_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
3042 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3043 using cuda_block_size_xyz_direct = cuda_indexer_direct<cuda::block_x<X_SIZE>,
3044 cuda::block_y<Y_SIZE>,
3045 cuda::block_z<Z_SIZE>>;
3046 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3047 using cuda_block_size_xzy_direct = cuda_indexer_direct<cuda::block_x<X_SIZE>,
3048 cuda::block_z<Z_SIZE>,
3049 cuda::block_y<Y_SIZE>>;
3050 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3051 using cuda_block_size_yxz_direct = cuda_indexer_direct<cuda::block_y<Y_SIZE>,
3052 cuda::block_x<X_SIZE>,
3053 cuda::block_z<Z_SIZE>>;
3054 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3055 using cuda_block_size_yzx_direct = cuda_indexer_direct<cuda::block_y<Y_SIZE>,
3056 cuda::block_z<Z_SIZE>,
3057 cuda::block_x<X_SIZE>>;
3058 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3059 using cuda_block_size_zxy_direct = cuda_indexer_direct<cuda::block_z<Z_SIZE>,
3060 cuda::block_x<X_SIZE>,
3061 cuda::block_y<Y_SIZE>>;
3062 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3063 using cuda_block_size_zyx_direct = cuda_indexer_direct<cuda::block_z<Z_SIZE>,
3064 cuda::block_y<Y_SIZE>,
3065 cuda::block_x<X_SIZE>>;
3067 template<
int X_BLOCK_SIZE,
int X_GRID_SIZE = named_usage::unspecified>
3068 using cuda_global_size_x_direct =
3069 cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3070 template<
int Y_BLOCK_SIZE,
int Y_GRID_SIZE = named_usage::unspecified>
3071 using cuda_global_size_y_direct =
3072 cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3073 template<
int Z_BLOCK_SIZE,
int Z_GRID_SIZE = named_usage::unspecified>
3074 using cuda_global_size_z_direct =
3075 cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Two- and three-dimension cuda_global_size_*_direct aliases. Each passes one
// cuda::global_<dim><BLOCK_SIZE, GRID_SIZE> indexer per letter of its name,
// in that order, to cuda_indexer_direct.
// NOTE(review): every template header below shows only its first parameter,
// yet the alias bodies use further *_BLOCK_SIZE / *_GRID_SIZE names. The
// remaining parameters (and any defaults) are presumably on lines missing
// from this listing - confirm their order against the full header.

// Two dimensions.
3076 template<
int X_BLOCK_SIZE,
3080 using cuda_global_size_xy_direct =
3081 cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3082 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3083 template<
int X_BLOCK_SIZE,
3087 using cuda_global_size_xz_direct =
3088 cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3089 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3090 template<
int Y_BLOCK_SIZE,
3094 using cuda_global_size_yx_direct =
3095 cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3096 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3097 template<
int Y_BLOCK_SIZE,
3101 using cuda_global_size_yz_direct =
3102 cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3103 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3104 template<
int Z_BLOCK_SIZE,
3108 using cuda_global_size_zx_direct =
3109 cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3110 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3111 template<
int Z_BLOCK_SIZE,
3115 using cuda_global_size_zy_direct =
3116 cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3117 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;

// Three dimensions.
3118 template<
int X_BLOCK_SIZE,
3124 using cuda_global_size_xyz_direct =
3125 cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3126 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3127 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3128 template<
int X_BLOCK_SIZE,
3134 using cuda_global_size_xzy_direct =
3135 cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3136 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3137 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3138 template<
int Y_BLOCK_SIZE,
3144 using cuda_global_size_yxz_direct =
3145 cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3146 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3147 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3148 template<
int Y_BLOCK_SIZE,
3154 using cuda_global_size_yzx_direct =
3155 cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3156 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3157 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3158 template<
int Z_BLOCK_SIZE,
3164 using cuda_global_size_zxy_direct =
3165 cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3166 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3167 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3168 template<
int Z_BLOCK_SIZE,
3174 using cuda_global_size_zyx_direct =
3175 cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3176 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3177 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3184 template<
int X_SIZE>
3185 using cuda_thread_size_x_loop = cuda_indexer_loop<cuda::thread_x<X_SIZE>>;
3186 template<
int Y_SIZE>
3187 using cuda_thread_size_y_loop = cuda_indexer_loop<cuda::thread_y<Y_SIZE>>;
3188 template<
int Z_SIZE>
3189 using cuda_thread_size_z_loop = cuda_indexer_loop<cuda::thread_z<Z_SIZE>>;
// 2-D loop-indexer aliases over CUDA thread dimensions. The two letters in
// each alias name give the order in which the cuda::thread_* arguments are
// passed to cuda_indexer_loop; the SIZE template parameters are declared in
// that same order.
3190 template<
int X_SIZE,
int Y_SIZE>
3191 using cuda_thread_size_xy_loop =
3192 cuda_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
3193 template<
int X_SIZE,
int Z_SIZE>
3194 using cuda_thread_size_xz_loop =
3195 cuda_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
3196 template<
int Y_SIZE,
int X_SIZE>
3197 using cuda_thread_size_yx_loop =
3198 cuda_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
3199 template<
int Y_SIZE,
int Z_SIZE>
3200 using cuda_thread_size_yz_loop =
3201 cuda_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
3202 template<
int Z_SIZE,
int X_SIZE>
3203 using cuda_thread_size_zx_loop =
3204 cuda_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
3205 template<
int Z_SIZE,
int Y_SIZE>
3206 using cuda_thread_size_zy_loop =
3207 cuda_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
// 3-D loop-indexer aliases over CUDA thread dimensions, one per permutation
// of x, y, z. The three letters in each alias name give the order in which
// the cuda::thread_* arguments are passed to cuda_indexer_loop; the SIZE
// template parameters are declared in that same order.
3208 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3209 using cuda_thread_size_xyz_loop = cuda_indexer_loop<cuda::thread_x<X_SIZE>,
3210 cuda::thread_y<Y_SIZE>,
3211 cuda::thread_z<Z_SIZE>>;
3212 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3213 using cuda_thread_size_xzy_loop = cuda_indexer_loop<cuda::thread_x<X_SIZE>,
3214 cuda::thread_z<Z_SIZE>,
3215 cuda::thread_y<Y_SIZE>>;
3216 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3217 using cuda_thread_size_yxz_loop = cuda_indexer_loop<cuda::thread_y<Y_SIZE>,
3218 cuda::thread_x<X_SIZE>,
3219 cuda::thread_z<Z_SIZE>>;
3220 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3221 using cuda_thread_size_yzx_loop = cuda_indexer_loop<cuda::thread_y<Y_SIZE>,
3222 cuda::thread_z<Z_SIZE>,
3223 cuda::thread_x<X_SIZE>>;
3224 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3225 using cuda_thread_size_zxy_loop = cuda_indexer_loop<cuda::thread_z<Z_SIZE>,
3226 cuda::thread_x<X_SIZE>,
3227 cuda::thread_y<Y_SIZE>>;
3228 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3229 using cuda_thread_size_zyx_loop = cuda_indexer_loop<cuda::thread_z<Z_SIZE>,
3230 cuda::thread_y<Y_SIZE>,
3231 cuda::thread_x<X_SIZE>>;
// 1-D loop-indexer aliases over a single CUDA block dimension: each
// instantiates cuda_indexer_loop with cuda::block_x, block_y or block_z,
// parameterized by one compile-time SIZE value.
3233 template<
int X_SIZE>
3234 using cuda_block_size_x_loop = cuda_indexer_loop<cuda::block_x<X_SIZE>>;
3235 template<
int Y_SIZE>
3236 using cuda_block_size_y_loop = cuda_indexer_loop<cuda::block_y<Y_SIZE>>;
3237 template<
int Z_SIZE>
3238 using cuda_block_size_z_loop = cuda_indexer_loop<cuda::block_z<Z_SIZE>>;
// 2-D loop-indexer aliases over CUDA block dimensions. The two letters in
// each alias name give the order in which the cuda::block_* arguments are
// passed to cuda_indexer_loop; the SIZE template parameters are declared in
// that same order.
3239 template<
int X_SIZE,
int Y_SIZE>
3240 using cuda_block_size_xy_loop =
3241 cuda_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3242 template<
int X_SIZE,
int Z_SIZE>
3243 using cuda_block_size_xz_loop =
3244 cuda_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3245 template<
int Y_SIZE,
int X_SIZE>
3246 using cuda_block_size_yx_loop =
3247 cuda_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3248 template<
int Y_SIZE,
int Z_SIZE>
3249 using cuda_block_size_yz_loop =
3250 cuda_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3251 template<
int Z_SIZE,
int X_SIZE>
3252 using cuda_block_size_zx_loop =
3253 cuda_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3254 template<
int Z_SIZE,
int Y_SIZE>
3255 using cuda_block_size_zy_loop =
3256 cuda_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// 3-D loop-indexer aliases over CUDA block dimensions, one per permutation
// of x, y, z. The three letters in each alias name give the order in which
// the cuda::block_* arguments are passed to cuda_indexer_loop; the SIZE
// template parameters are declared in that same order.
3257 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3258 using cuda_block_size_xyz_loop = cuda_indexer_loop<cuda::block_x<X_SIZE>,
3259 cuda::block_y<Y_SIZE>,
3260 cuda::block_z<Z_SIZE>>;
3261 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3262 using cuda_block_size_xzy_loop = cuda_indexer_loop<cuda::block_x<X_SIZE>,
3263 cuda::block_z<Z_SIZE>,
3264 cuda::block_y<Y_SIZE>>;
3265 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3266 using cuda_block_size_yxz_loop = cuda_indexer_loop<cuda::block_y<Y_SIZE>,
3267 cuda::block_x<X_SIZE>,
3268 cuda::block_z<Z_SIZE>>;
3269 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3270 using cuda_block_size_yzx_loop = cuda_indexer_loop<cuda::block_y<Y_SIZE>,
3271 cuda::block_z<Z_SIZE>,
3272 cuda::block_x<X_SIZE>>;
3273 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3274 using cuda_block_size_zxy_loop = cuda_indexer_loop<cuda::block_z<Z_SIZE>,
3275 cuda::block_x<X_SIZE>,
3276 cuda::block_y<Y_SIZE>>;
3277 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3278 using cuda_block_size_zyx_loop = cuda_indexer_loop<cuda::block_z<Z_SIZE>,
3279 cuda::block_y<Y_SIZE>,
3280 cuda::block_x<X_SIZE>>;
// 1-D loop-indexer aliases over a single CUDA global dimension: each
// instantiates cuda_indexer_loop with cuda::global_x, global_y or global_z.
// The first template parameter is the BLOCK_SIZE; the GRID_SIZE parameter is
// optional and defaults to named_usage::unspecified.
3282 template<
int X_BLOCK_SIZE,
int X_GRID_SIZE = named_usage::unspecified>
3283 using cuda_global_size_x_loop =
3284 cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3285 template<
int Y_BLOCK_SIZE,
int Y_GRID_SIZE = named_usage::unspecified>
3286 using cuda_global_size_y_loop =
3287 cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3288 template<
int Z_BLOCK_SIZE,
int Z_GRID_SIZE = named_usage::unspecified>
3289 using cuda_global_size_z_loop =
3290 cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// 2-D loop-indexer aliases over CUDA global dimensions. The two letters in
// each alias name give the order in which the cuda::global_* arguments are
// passed to cuda_indexer_loop; every global_* argument takes a
// <BLOCK_SIZE, GRID_SIZE> pair.
// NOTE(review): only the first template parameter of each alias is visible
// here, while the alias bodies also use the other *_BLOCK_SIZE / *_GRID_SIZE
// names — confirm the full parameter lists.
3291 template<
int X_BLOCK_SIZE,
3295 using cuda_global_size_xy_loop =
3296 cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3297 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3298 template<
int X_BLOCK_SIZE,
3302 using cuda_global_size_xz_loop =
3303 cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3304 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3305 template<
int Y_BLOCK_SIZE,
3309 using cuda_global_size_yx_loop =
3310 cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3311 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3312 template<
int Y_BLOCK_SIZE,
3316 using cuda_global_size_yz_loop =
3317 cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3318 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3319 template<
int Z_BLOCK_SIZE,
3323 using cuda_global_size_zx_loop =
3324 cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3325 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3326 template<
int Z_BLOCK_SIZE,
3330 using cuda_global_size_zy_loop =
3331 cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3332 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// 3-D loop-indexer aliases over CUDA global dimensions, one per permutation
// of x, y, z. The three letters in each alias name give the order in which
// the cuda::global_* arguments are passed to cuda_indexer_loop; every
// global_* argument takes a <BLOCK_SIZE, GRID_SIZE> pair.
// NOTE(review): only the first template parameter of each alias is visible
// here, while the alias bodies also use the other *_BLOCK_SIZE / *_GRID_SIZE
// names — confirm the full parameter lists.
3333 template<
int X_BLOCK_SIZE,
3339 using cuda_global_size_xyz_loop =
3340 cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3341 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3342 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3343 template<
int X_BLOCK_SIZE,
3349 using cuda_global_size_xzy_loop =
3350 cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3351 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3352 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3353 template<
int Y_BLOCK_SIZE,
3359 using cuda_global_size_yxz_loop =
3360 cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3361 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3362 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3363 template<
int Y_BLOCK_SIZE,
3369 using cuda_global_size_yzx_loop =
3370 cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3371 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3372 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3373 template<
int Z_BLOCK_SIZE,
3379 using cuda_global_size_zxy_loop =
3380 cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3381 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3382 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3383 template<
int Z_BLOCK_SIZE,
3389 using cuda_global_size_zyx_loop =
3390 cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3391 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3392 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
// 1-D flatten/direct-unchecked aliases over a single CUDA thread dimension:
// each instantiates cuda_flatten_indexer_direct_unchecked with
// cuda::thread_x, thread_y or thread_z, parameterized by one compile-time
// SIZE value.
3401 template<
int X_SIZE>
3402 using cuda_flatten_thread_size_x_direct_unchecked =
3403 cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>>;
3404 template<
int Y_SIZE>
3405 using cuda_flatten_thread_size_y_direct_unchecked =
3406 cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>>;
3407 template<
int Z_SIZE>
3408 using cuda_flatten_thread_size_z_direct_unchecked =
3409 cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>>;
// 2-D flatten/direct-unchecked aliases over CUDA thread dimensions. The two
// letters in each alias name give the order in which the cuda::thread_*
// arguments are passed to cuda_flatten_indexer_direct_unchecked; the SIZE
// template parameters are declared in that same order.
3410 template<
int X_SIZE,
int Y_SIZE>
3411 using cuda_flatten_thread_size_xy_direct_unchecked =
3412 cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3413 cuda::thread_y<Y_SIZE>>;
3414 template<
int X_SIZE,
int Z_SIZE>
3415 using cuda_flatten_thread_size_xz_direct_unchecked =
3416 cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3417 cuda::thread_z<Z_SIZE>>;
3418 template<
int Y_SIZE,
int X_SIZE>
3419 using cuda_flatten_thread_size_yx_direct_unchecked =
3420 cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3421 cuda::thread_x<X_SIZE>>;
3422 template<
int Y_SIZE,
int Z_SIZE>
3423 using cuda_flatten_thread_size_yz_direct_unchecked =
3424 cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3425 cuda::thread_z<Z_SIZE>>;
3426 template<
int Z_SIZE,
int X_SIZE>
3427 using cuda_flatten_thread_size_zx_direct_unchecked =
3428 cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3429 cuda::thread_x<X_SIZE>>;
3430 template<
int Z_SIZE,
int Y_SIZE>
3431 using cuda_flatten_thread_size_zy_direct_unchecked =
3432 cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3433 cuda::thread_y<Y_SIZE>>;
// 3-D flatten/direct-unchecked aliases over CUDA thread dimensions, one per
// permutation of x, y, z. The three letters in each alias name give the
// order in which the cuda::thread_* arguments are passed to
// cuda_flatten_indexer_direct_unchecked; the SIZE template parameters are
// declared in that same order.
3434 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3435 using cuda_flatten_thread_size_xyz_direct_unchecked =
3436 cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3437 cuda::thread_y<Y_SIZE>,
3438 cuda::thread_z<Z_SIZE>>;
3439 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3440 using cuda_flatten_thread_size_xzy_direct_unchecked =
3441 cuda_flatten_indexer_direct_unchecked<cuda::thread_x<X_SIZE>,
3442 cuda::thread_z<Z_SIZE>,
3443 cuda::thread_y<Y_SIZE>>;
3444 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3445 using cuda_flatten_thread_size_yxz_direct_unchecked =
3446 cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3447 cuda::thread_x<X_SIZE>,
3448 cuda::thread_z<Z_SIZE>>;
3449 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3450 using cuda_flatten_thread_size_yzx_direct_unchecked =
3451 cuda_flatten_indexer_direct_unchecked<cuda::thread_y<Y_SIZE>,
3452 cuda::thread_z<Z_SIZE>,
3453 cuda::thread_x<X_SIZE>>;
3454 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3455 using cuda_flatten_thread_size_zxy_direct_unchecked =
3456 cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3457 cuda::thread_x<X_SIZE>,
3458 cuda::thread_y<Y_SIZE>>;
3459 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3460 using cuda_flatten_thread_size_zyx_direct_unchecked =
3461 cuda_flatten_indexer_direct_unchecked<cuda::thread_z<Z_SIZE>,
3462 cuda::thread_y<Y_SIZE>,
3463 cuda::thread_x<X_SIZE>>;
// 1-D flatten/direct-unchecked aliases over a single CUDA block dimension:
// each instantiates cuda_flatten_indexer_direct_unchecked with
// cuda::block_x, block_y or block_z, parameterized by one compile-time SIZE
// value.
3465 template<
int X_SIZE>
3466 using cuda_flatten_block_size_x_direct_unchecked =
3467 cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>>;
3468 template<
int Y_SIZE>
3469 using cuda_flatten_block_size_y_direct_unchecked =
3470 cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>>;
3471 template<
int Z_SIZE>
3472 using cuda_flatten_block_size_z_direct_unchecked =
3473 cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>>;
// 2-D flatten/direct-unchecked aliases over CUDA block dimensions. The two
// letters in each alias name give the order in which the cuda::block_*
// arguments are passed to cuda_flatten_indexer_direct_unchecked; the SIZE
// template parameters are declared in that same order.
3474 template<
int X_SIZE,
int Y_SIZE>
3475 using cuda_flatten_block_size_xy_direct_unchecked =
3476 cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3477 cuda::block_y<Y_SIZE>>;
3478 template<
int X_SIZE,
int Z_SIZE>
3479 using cuda_flatten_block_size_xz_direct_unchecked =
3480 cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3481 cuda::block_z<Z_SIZE>>;
3482 template<
int Y_SIZE,
int X_SIZE>
3483 using cuda_flatten_block_size_yx_direct_unchecked =
3484 cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3485 cuda::block_x<X_SIZE>>;
3486 template<
int Y_SIZE,
int Z_SIZE>
3487 using cuda_flatten_block_size_yz_direct_unchecked =
3488 cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3489 cuda::block_z<Z_SIZE>>;
3490 template<
int Z_SIZE,
int X_SIZE>
3491 using cuda_flatten_block_size_zx_direct_unchecked =
3492 cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3493 cuda::block_x<X_SIZE>>;
3494 template<
int Z_SIZE,
int Y_SIZE>
3495 using cuda_flatten_block_size_zy_direct_unchecked =
3496 cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3497 cuda::block_y<Y_SIZE>>;
// 3-D flatten/direct-unchecked aliases over CUDA block dimensions, one per
// permutation of x, y, z. The three letters in each alias name give the
// order in which the cuda::block_* arguments are passed to
// cuda_flatten_indexer_direct_unchecked; the SIZE template parameters are
// declared in that same order.
3498 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3499 using cuda_flatten_block_size_xyz_direct_unchecked =
3500 cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3501 cuda::block_y<Y_SIZE>,
3502 cuda::block_z<Z_SIZE>>;
3503 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3504 using cuda_flatten_block_size_xzy_direct_unchecked =
3505 cuda_flatten_indexer_direct_unchecked<cuda::block_x<X_SIZE>,
3506 cuda::block_z<Z_SIZE>,
3507 cuda::block_y<Y_SIZE>>;
3508 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3509 using cuda_flatten_block_size_yxz_direct_unchecked =
3510 cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3511 cuda::block_x<X_SIZE>,
3512 cuda::block_z<Z_SIZE>>;
3513 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3514 using cuda_flatten_block_size_yzx_direct_unchecked =
3515 cuda_flatten_indexer_direct_unchecked<cuda::block_y<Y_SIZE>,
3516 cuda::block_z<Z_SIZE>,
3517 cuda::block_x<X_SIZE>>;
3518 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3519 using cuda_flatten_block_size_zxy_direct_unchecked =
3520 cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3521 cuda::block_x<X_SIZE>,
3522 cuda::block_y<Y_SIZE>>;
3523 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3524 using cuda_flatten_block_size_zyx_direct_unchecked =
3525 cuda_flatten_indexer_direct_unchecked<cuda::block_z<Z_SIZE>,
3526 cuda::block_y<Y_SIZE>,
3527 cuda::block_x<X_SIZE>>;
// 1-D flatten/direct-unchecked aliases over a single CUDA global dimension:
// each instantiates cuda_flatten_indexer_direct_unchecked with
// cuda::global_x, global_y or global_z. The first template parameter is the
// BLOCK_SIZE; the GRID_SIZE parameter is optional and defaults to
// named_usage::unspecified.
3529 template<
int X_BLOCK_SIZE,
int X_GRID_SIZE = named_usage::unspecified>
3530 using cuda_flatten_global_size_x_direct_unchecked =
3531 cuda_flatten_indexer_direct_unchecked<
3532 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3533 template<
int Y_BLOCK_SIZE,
int Y_GRID_SIZE = named_usage::unspecified>
3534 using cuda_flatten_global_size_y_direct_unchecked =
3535 cuda_flatten_indexer_direct_unchecked<
3536 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3537 template<
int Z_BLOCK_SIZE,
int Z_GRID_SIZE = named_usage::unspecified>
3538 using cuda_flatten_global_size_z_direct_unchecked =
3539 cuda_flatten_indexer_direct_unchecked<
3540 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// 2-D flatten/direct-unchecked aliases over CUDA global dimensions. The two
// letters in each alias name give the order in which the cuda::global_*
// arguments are passed to cuda_flatten_indexer_direct_unchecked; every
// global_* argument takes a <BLOCK_SIZE, GRID_SIZE> pair.
// NOTE(review): only the first template parameter of each alias is visible
// here, while the alias bodies also use the other *_BLOCK_SIZE / *_GRID_SIZE
// names — confirm the full parameter lists.
3541 template<
int X_BLOCK_SIZE,
3545 using cuda_flatten_global_size_xy_direct_unchecked =
3546 cuda_flatten_indexer_direct_unchecked<
3547 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3548 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3549 template<
int X_BLOCK_SIZE,
3553 using cuda_flatten_global_size_xz_direct_unchecked =
3554 cuda_flatten_indexer_direct_unchecked<
3555 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3556 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3557 template<
int Y_BLOCK_SIZE,
3561 using cuda_flatten_global_size_yx_direct_unchecked =
3562 cuda_flatten_indexer_direct_unchecked<
3563 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3564 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3565 template<
int Y_BLOCK_SIZE,
3569 using cuda_flatten_global_size_yz_direct_unchecked =
3570 cuda_flatten_indexer_direct_unchecked<
3571 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3572 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3573 template<
int Z_BLOCK_SIZE,
3577 using cuda_flatten_global_size_zx_direct_unchecked =
3578 cuda_flatten_indexer_direct_unchecked<
3579 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3580 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3581 template<
int Z_BLOCK_SIZE,
3585 using cuda_flatten_global_size_zy_direct_unchecked =
3586 cuda_flatten_indexer_direct_unchecked<
3587 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3588 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// 3-D flatten/direct-unchecked aliases over CUDA global dimensions, one per
// permutation of x, y, z. The three letters in each alias name give the
// order in which the cuda::global_* arguments are passed to
// cuda_flatten_indexer_direct_unchecked; every global_* argument takes a
// <BLOCK_SIZE, GRID_SIZE> pair.
// NOTE(review): only the first template parameter of each alias is visible
// here, while the alias bodies also use the other *_BLOCK_SIZE / *_GRID_SIZE
// names — confirm the full parameter lists.
3589 template<
int X_BLOCK_SIZE,
3595 using cuda_flatten_global_size_xyz_direct_unchecked =
3596 cuda_flatten_indexer_direct_unchecked<
3597 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3598 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3599 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3600 template<
int X_BLOCK_SIZE,
3606 using cuda_flatten_global_size_xzy_direct_unchecked =
3607 cuda_flatten_indexer_direct_unchecked<
3608 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3609 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3610 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3611 template<
int Y_BLOCK_SIZE,
3617 using cuda_flatten_global_size_yxz_direct_unchecked =
3618 cuda_flatten_indexer_direct_unchecked<
3619 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3620 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3621 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3622 template<
int Y_BLOCK_SIZE,
3628 using cuda_flatten_global_size_yzx_direct_unchecked =
3629 cuda_flatten_indexer_direct_unchecked<
3630 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3631 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3632 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3633 template<
int Z_BLOCK_SIZE,
3639 using cuda_flatten_global_size_zxy_direct_unchecked =
3640 cuda_flatten_indexer_direct_unchecked<
3641 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3642 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3643 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3644 template<
int Z_BLOCK_SIZE,
3650 using cuda_flatten_global_size_zyx_direct_unchecked =
3651 cuda_flatten_indexer_direct_unchecked<
3652 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3653 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3654 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
// 1-D flatten/direct aliases over a single CUDA thread dimension: each
// instantiates cuda_flatten_indexer_direct with cuda::thread_x, thread_y or
// thread_z, parameterized by one compile-time SIZE value.
3664 template<
int X_SIZE>
3665 using cuda_flatten_thread_size_x_direct =
3666 cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>>;
3667 template<
int Y_SIZE>
3668 using cuda_flatten_thread_size_y_direct =
3669 cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>>;
3670 template<
int Z_SIZE>
3671 using cuda_flatten_thread_size_z_direct =
3672 cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>>;
// 2-D flatten/direct aliases over CUDA thread dimensions. The two letters in
// each alias name give the order in which the cuda::thread_* arguments are
// passed to cuda_flatten_indexer_direct; the SIZE template parameters are
// declared in that same order.
3673 template<
int X_SIZE,
int Y_SIZE>
3674 using cuda_flatten_thread_size_xy_direct =
3675 cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
3676 template<
int X_SIZE,
int Z_SIZE>
3677 using cuda_flatten_thread_size_xz_direct =
3678 cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
3679 template<
int Y_SIZE,
int X_SIZE>
3680 using cuda_flatten_thread_size_yx_direct =
3681 cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
3682 template<
int Y_SIZE,
int Z_SIZE>
3683 using cuda_flatten_thread_size_yz_direct =
3684 cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
3685 template<
int Z_SIZE,
int X_SIZE>
3686 using cuda_flatten_thread_size_zx_direct =
3687 cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
3688 template<
int Z_SIZE,
int Y_SIZE>
3689 using cuda_flatten_thread_size_zy_direct =
3690 cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
// 3-D flatten/direct aliases over CUDA thread dimensions, one per
// permutation of x, y, z. The three letters in each alias name give the
// order in which the cuda::thread_* arguments are passed to
// cuda_flatten_indexer_direct; the SIZE template parameters are declared in
// that same order.
3691 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3692 using cuda_flatten_thread_size_xyz_direct =
3693 cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>,
3694 cuda::thread_y<Y_SIZE>,
3695 cuda::thread_z<Z_SIZE>>;
3696 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3697 using cuda_flatten_thread_size_xzy_direct =
3698 cuda_flatten_indexer_direct<cuda::thread_x<X_SIZE>,
3699 cuda::thread_z<Z_SIZE>,
3700 cuda::thread_y<Y_SIZE>>;
3701 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3702 using cuda_flatten_thread_size_yxz_direct =
3703 cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>,
3704 cuda::thread_x<X_SIZE>,
3705 cuda::thread_z<Z_SIZE>>;
3706 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3707 using cuda_flatten_thread_size_yzx_direct =
3708 cuda_flatten_indexer_direct<cuda::thread_y<Y_SIZE>,
3709 cuda::thread_z<Z_SIZE>,
3710 cuda::thread_x<X_SIZE>>;
3711 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3712 using cuda_flatten_thread_size_zxy_direct =
3713 cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>,
3714 cuda::thread_x<X_SIZE>,
3715 cuda::thread_y<Y_SIZE>>;
3716 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3717 using cuda_flatten_thread_size_zyx_direct =
3718 cuda_flatten_indexer_direct<cuda::thread_z<Z_SIZE>,
3719 cuda::thread_y<Y_SIZE>,
3720 cuda::thread_x<X_SIZE>>;
// 1-D flatten/direct aliases over a single CUDA block dimension: each
// instantiates cuda_flatten_indexer_direct with cuda::block_x, block_y or
// block_z, parameterized by one compile-time SIZE value.
3722 template<
int X_SIZE>
3723 using cuda_flatten_block_size_x_direct =
3724 cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>>;
3725 template<
int Y_SIZE>
3726 using cuda_flatten_block_size_y_direct =
3727 cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>>;
3728 template<
int Z_SIZE>
3729 using cuda_flatten_block_size_z_direct =
3730 cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>>;
// 2-D flatten/direct aliases over CUDA block dimensions. The two letters in
// each alias name give the order in which the cuda::block_* arguments are
// passed to cuda_flatten_indexer_direct; the SIZE template parameters are
// declared in that same order.
3731 template<
int X_SIZE,
int Y_SIZE>
3732 using cuda_flatten_block_size_xy_direct =
3733 cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3734 template<
int X_SIZE,
int Z_SIZE>
3735 using cuda_flatten_block_size_xz_direct =
3736 cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3737 template<
int Y_SIZE,
int X_SIZE>
3738 using cuda_flatten_block_size_yx_direct =
3739 cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3740 template<
int Y_SIZE,
int Z_SIZE>
3741 using cuda_flatten_block_size_yz_direct =
3742 cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3743 template<
int Z_SIZE,
int X_SIZE>
3744 using cuda_flatten_block_size_zx_direct =
3745 cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3746 template<
int Z_SIZE,
int Y_SIZE>
3747 using cuda_flatten_block_size_zy_direct =
3748 cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// 3-D flatten/direct aliases over CUDA block dimensions, one per permutation
// of x, y, z. The three letters in each alias name give the order in which
// the cuda::block_* arguments are passed to cuda_flatten_indexer_direct; the
// SIZE template parameters are declared in that same order.
3749 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3750 using cuda_flatten_block_size_xyz_direct =
3751 cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>,
3752 cuda::block_y<Y_SIZE>,
3753 cuda::block_z<Z_SIZE>>;
3754 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3755 using cuda_flatten_block_size_xzy_direct =
3756 cuda_flatten_indexer_direct<cuda::block_x<X_SIZE>,
3757 cuda::block_z<Z_SIZE>,
3758 cuda::block_y<Y_SIZE>>;
3759 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3760 using cuda_flatten_block_size_yxz_direct =
3761 cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>,
3762 cuda::block_x<X_SIZE>,
3763 cuda::block_z<Z_SIZE>>;
3764 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3765 using cuda_flatten_block_size_yzx_direct =
3766 cuda_flatten_indexer_direct<cuda::block_y<Y_SIZE>,
3767 cuda::block_z<Z_SIZE>,
3768 cuda::block_x<X_SIZE>>;
3769 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3770 using cuda_flatten_block_size_zxy_direct =
3771 cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>,
3772 cuda::block_x<X_SIZE>,
3773 cuda::block_y<Y_SIZE>>;
3774 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3775 using cuda_flatten_block_size_zyx_direct =
3776 cuda_flatten_indexer_direct<cuda::block_z<Z_SIZE>,
3777 cuda::block_y<Y_SIZE>,
3778 cuda::block_x<X_SIZE>>;
// 1-D flatten/direct aliases over a single CUDA global dimension: each
// instantiates cuda_flatten_indexer_direct with cuda::global_x, global_y or
// global_z. The first template parameter is the BLOCK_SIZE; the GRID_SIZE
// parameter is optional and defaults to named_usage::unspecified.
3780 template<
int X_BLOCK_SIZE,
int X_GRID_SIZE = named_usage::unspecified>
3781 using cuda_flatten_global_size_x_direct =
3782 cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3783 template<
int Y_BLOCK_SIZE,
int Y_GRID_SIZE = named_usage::unspecified>
3784 using cuda_flatten_global_size_y_direct =
3785 cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3786 template<
int Z_BLOCK_SIZE,
int Z_GRID_SIZE = named_usage::unspecified>
3787 using cuda_flatten_global_size_z_direct =
3788 cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// 2-D flatten/direct aliases over CUDA global dimensions. The two letters in
// each alias name give the order in which the cuda::global_* arguments are
// passed to cuda_flatten_indexer_direct; every global_* argument takes a
// <BLOCK_SIZE, GRID_SIZE> pair.
// NOTE(review): only the first template parameter of each alias is visible
// here, while the alias bodies also use the other *_BLOCK_SIZE / *_GRID_SIZE
// names — confirm the full parameter lists.
3789 template<
int X_BLOCK_SIZE,
3793 using cuda_flatten_global_size_xy_direct =
3794 cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3795 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3796 template<
int X_BLOCK_SIZE,
3800 using cuda_flatten_global_size_xz_direct =
3801 cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3802 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3803 template<
int Y_BLOCK_SIZE,
3807 using cuda_flatten_global_size_yx_direct =
3808 cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3809 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3810 template<
int Y_BLOCK_SIZE,
3814 using cuda_flatten_global_size_yz_direct =
3815 cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3816 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3817 template<
int Z_BLOCK_SIZE,
3821 using cuda_flatten_global_size_zx_direct =
3822 cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3823 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3824 template<
int Z_BLOCK_SIZE,
3828 using cuda_flatten_global_size_zy_direct =
3829 cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3830 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// 3-D flatten/direct aliases over CUDA global dimensions, one per
// permutation of x, y, z. The three letters in each alias name give the
// order in which the cuda::global_* arguments are passed to
// cuda_flatten_indexer_direct; every global_* argument takes a
// <BLOCK_SIZE, GRID_SIZE> pair.
// NOTE(review): only the first template parameter of each alias is visible
// here, while the alias bodies also use the other *_BLOCK_SIZE / *_GRID_SIZE
// names — confirm the full parameter lists.
3831 template<
int X_BLOCK_SIZE,
3837 using cuda_flatten_global_size_xyz_direct =
3838 cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3839 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3840 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3841 template<
int X_BLOCK_SIZE,
3847 using cuda_flatten_global_size_xzy_direct =
3848 cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3849 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3850 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3851 template<
int Y_BLOCK_SIZE,
3857 using cuda_flatten_global_size_yxz_direct =
3858 cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3859 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3860 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3861 template<
int Y_BLOCK_SIZE,
3867 using cuda_flatten_global_size_yzx_direct =
3868 cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3869 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3870 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3871 template<
int Z_BLOCK_SIZE,
3877 using cuda_flatten_global_size_zxy_direct =
3878 cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3879 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3880 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3881 template<
int Z_BLOCK_SIZE,
3887 using cuda_flatten_global_size_zyx_direct =
3888 cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3889 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3890 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
// 1-D flatten/loop aliases over a single CUDA thread dimension: each
// instantiates cuda_flatten_indexer_loop with cuda::thread_x, thread_y or
// thread_z, parameterized by one compile-time SIZE value.
3899 template<
int X_SIZE>
3900 using cuda_flatten_thread_size_x_loop =
3901 cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>>;
3902 template<
int Y_SIZE>
3903 using cuda_flatten_thread_size_y_loop =
3904 cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>>;
3905 template<
int Z_SIZE>
3906 using cuda_flatten_thread_size_z_loop =
3907 cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>>;
// 2-D flatten/loop aliases over CUDA thread dimensions. The two letters in
// each alias name give the order in which the cuda::thread_* arguments are
// passed to cuda_flatten_indexer_loop; the SIZE template parameters are
// declared in that same order.
3908 template<
int X_SIZE,
int Y_SIZE>
3909 using cuda_flatten_thread_size_xy_loop =
3910 cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_y<Y_SIZE>>;
3911 template<
int X_SIZE,
int Z_SIZE>
3912 using cuda_flatten_thread_size_xz_loop =
3913 cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>, cuda::thread_z<Z_SIZE>>;
3914 template<
int Y_SIZE,
int X_SIZE>
3915 using cuda_flatten_thread_size_yx_loop =
3916 cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_x<X_SIZE>>;
3917 template<
int Y_SIZE,
int Z_SIZE>
3918 using cuda_flatten_thread_size_yz_loop =
3919 cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>, cuda::thread_z<Z_SIZE>>;
3920 template<
int Z_SIZE,
int X_SIZE>
3921 using cuda_flatten_thread_size_zx_loop =
3922 cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_x<X_SIZE>>;
3923 template<
int Z_SIZE,
int Y_SIZE>
3924 using cuda_flatten_thread_size_zy_loop =
3925 cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>, cuda::thread_y<Y_SIZE>>;
// 3-D flatten/loop aliases over CUDA thread dimensions, one per permutation
// of x, y, z. The three letters in each alias name give the order in which
// the cuda::thread_* arguments are passed to cuda_flatten_indexer_loop; the
// SIZE template parameters are declared in that same order.
3926 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3927 using cuda_flatten_thread_size_xyz_loop =
3928 cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>,
3929 cuda::thread_y<Y_SIZE>,
3930 cuda::thread_z<Z_SIZE>>;
3931 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3932 using cuda_flatten_thread_size_xzy_loop =
3933 cuda_flatten_indexer_loop<cuda::thread_x<X_SIZE>,
3934 cuda::thread_z<Z_SIZE>,
3935 cuda::thread_y<Y_SIZE>>;
3936 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3937 using cuda_flatten_thread_size_yxz_loop =
3938 cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>,
3939 cuda::thread_x<X_SIZE>,
3940 cuda::thread_z<Z_SIZE>>;
3941 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
3942 using cuda_flatten_thread_size_yzx_loop =
3943 cuda_flatten_indexer_loop<cuda::thread_y<Y_SIZE>,
3944 cuda::thread_z<Z_SIZE>,
3945 cuda::thread_x<X_SIZE>>;
3946 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
3947 using cuda_flatten_thread_size_zxy_loop =
3948 cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>,
3949 cuda::thread_x<X_SIZE>,
3950 cuda::thread_y<Y_SIZE>>;
3951 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
3952 using cuda_flatten_thread_size_zyx_loop =
3953 cuda_flatten_indexer_loop<cuda::thread_z<Z_SIZE>,
3954 cuda::thread_y<Y_SIZE>,
3955 cuda::thread_x<X_SIZE>>;
// 1-D flatten/loop aliases over a single CUDA block dimension: each
// instantiates cuda_flatten_indexer_loop with cuda::block_x, block_y or
// block_z, parameterized by one compile-time SIZE value.
3957 template<
int X_SIZE>
3958 using cuda_flatten_block_size_x_loop =
3959 cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>>;
3960 template<
int Y_SIZE>
3961 using cuda_flatten_block_size_y_loop =
3962 cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>>;
3963 template<
int Z_SIZE>
3964 using cuda_flatten_block_size_z_loop =
3965 cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>>;
// 2-D flatten/loop aliases over CUDA block dimensions. The two letters in
// each alias name give the order in which the cuda::block_* arguments are
// passed to cuda_flatten_indexer_loop; the SIZE template parameters are
// declared in that same order.
3966 template<
int X_SIZE,
int Y_SIZE>
3967 using cuda_flatten_block_size_xy_loop =
3968 cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_y<Y_SIZE>>;
3969 template<
int X_SIZE,
int Z_SIZE>
3970 using cuda_flatten_block_size_xz_loop =
3971 cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>, cuda::block_z<Z_SIZE>>;
3972 template<
int Y_SIZE,
int X_SIZE>
3973 using cuda_flatten_block_size_yx_loop =
3974 cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_x<X_SIZE>>;
3975 template<
int Y_SIZE,
int Z_SIZE>
3976 using cuda_flatten_block_size_yz_loop =
3977 cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>, cuda::block_z<Z_SIZE>>;
3978 template<
int Z_SIZE,
int X_SIZE>
3979 using cuda_flatten_block_size_zx_loop =
3980 cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_x<X_SIZE>>;
3981 template<
int Z_SIZE,
int Y_SIZE>
3982 using cuda_flatten_block_size_zy_loop =
3983 cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>, cuda::block_y<Y_SIZE>>;
// 3-D flatten/loop aliases over CUDA block dimensions, one per permutation
// of x, y, z. The three letters in each alias name give the order in which
// the cuda::block_* arguments are passed to cuda_flatten_indexer_loop; the
// SIZE template parameters are declared in that same order.
3984 template<
int X_SIZE,
int Y_SIZE,
int Z_SIZE>
3985 using cuda_flatten_block_size_xyz_loop =
3986 cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>,
3987 cuda::block_y<Y_SIZE>,
3988 cuda::block_z<Z_SIZE>>;
3989 template<
int X_SIZE,
int Z_SIZE,
int Y_SIZE>
3990 using cuda_flatten_block_size_xzy_loop =
3991 cuda_flatten_indexer_loop<cuda::block_x<X_SIZE>,
3992 cuda::block_z<Z_SIZE>,
3993 cuda::block_y<Y_SIZE>>;
3994 template<
int Y_SIZE,
int X_SIZE,
int Z_SIZE>
3995 using cuda_flatten_block_size_yxz_loop =
3996 cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>,
3997 cuda::block_x<X_SIZE>,
3998 cuda::block_z<Z_SIZE>>;
3999 template<
int Y_SIZE,
int Z_SIZE,
int X_SIZE>
4000 using cuda_flatten_block_size_yzx_loop =
4001 cuda_flatten_indexer_loop<cuda::block_y<Y_SIZE>,
4002 cuda::block_z<Z_SIZE>,
4003 cuda::block_x<X_SIZE>>;
4004 template<
int Z_SIZE,
int X_SIZE,
int Y_SIZE>
4005 using cuda_flatten_block_size_zxy_loop =
4006 cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>,
4007 cuda::block_x<X_SIZE>,
4008 cuda::block_y<Y_SIZE>>;
4009 template<
int Z_SIZE,
int Y_SIZE,
int X_SIZE>
4010 using cuda_flatten_block_size_zyx_loop =
4011 cuda_flatten_indexer_loop<cuda::block_z<Z_SIZE>,
4012 cuda::block_y<Y_SIZE>,
4013 cuda::block_x<X_SIZE>>;
// 1-D flatten/loop aliases over a single CUDA global dimension: each
// instantiates cuda_flatten_indexer_loop with cuda::global_x, global_y or
// global_z. The first template parameter is the BLOCK_SIZE; the GRID_SIZE
// parameter is optional and defaults to named_usage::unspecified.
4015 template<
int X_BLOCK_SIZE,
int X_GRID_SIZE = named_usage::unspecified>
4016 using cuda_flatten_global_size_x_loop =
4017 cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
4018 template<
int Y_BLOCK_SIZE,
int Y_GRID_SIZE = named_usage::unspecified>
4019 using cuda_flatten_global_size_y_loop =
4020 cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
4021 template<
int Z_BLOCK_SIZE,
int Z_GRID_SIZE = named_usage::unspecified>
4022 using cuda_flatten_global_size_z_loop =
4023 cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
4024 template<
int X_BLOCK_SIZE,
4028 using cuda_flatten_global_size_xy_loop =
4029 cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4030 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
4031 template<
int X_BLOCK_SIZE,
4035 using cuda_flatten_global_size_xz_loop =
4036 cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4037 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
4038 template<
int Y_BLOCK_SIZE,
4042 using cuda_flatten_global_size_yx_loop =
4043 cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4044 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_yz_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_y then cuda::global_z
// (order matches the "yz" suffix), each given its own block and grid size.
// NOTE(review): only Y_BLOCK_SIZE is visible in the parameter list here,
// yet the body also uses Y_GRID_SIZE, Z_BLOCK_SIZE and Z_GRID_SIZE; the
// remaining parameter declarations (and any defaults) appear to be missing
// from this listing - confirm against the original header.
4045 template<
int Y_BLOCK_SIZE,
4049 using cuda_flatten_global_size_yz_loop =
4050 cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4051 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_zx_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_z then cuda::global_x
// (order matches the "zx" suffix), each given its own block and grid size.
// NOTE(review): only Z_BLOCK_SIZE is visible in the parameter list here,
// yet the body also uses Z_GRID_SIZE, X_BLOCK_SIZE and X_GRID_SIZE; the
// remaining parameter declarations (and any defaults) appear to be missing
// from this listing - confirm against the original header.
4052 template<
int Z_BLOCK_SIZE,
4056 using cuda_flatten_global_size_zx_loop =
4057 cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4058 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_zy_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_z then cuda::global_y
// (order matches the "zy" suffix), each given its own block and grid size.
// NOTE(review): only Z_BLOCK_SIZE is visible in the parameter list here,
// yet the body also uses Z_GRID_SIZE, Y_BLOCK_SIZE and Y_GRID_SIZE; the
// remaining parameter declarations (and any defaults) appear to be missing
// from this listing - confirm against the original header.
4059 template<
int Z_BLOCK_SIZE,
4063 using cuda_flatten_global_size_zy_loop =
4064 cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4065 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_xyz_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_x, cuda::global_y and
// cuda::global_z, in that order (matching the "xyz" suffix), each given its
// own block and grid size.
// NOTE(review): only X_BLOCK_SIZE is visible in the parameter list here,
// yet the body uses six parameters ({X,Y,Z}_BLOCK_SIZE, {X,Y,Z}_GRID_SIZE);
// the other declarations (and any defaults) appear to be missing from this
// listing - confirm against the original header.
4066 template<
int X_BLOCK_SIZE,
4072 using cuda_flatten_global_size_xyz_loop =
4073 cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4074 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4075 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_xzy_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_x, cuda::global_z and
// cuda::global_y, in that order (matching the "xzy" suffix), each given its
// own block and grid size.
// NOTE(review): only X_BLOCK_SIZE is visible in the parameter list here,
// yet the body uses six parameters ({X,Y,Z}_BLOCK_SIZE, {X,Y,Z}_GRID_SIZE);
// the other declarations (and any defaults) appear to be missing from this
// listing - confirm against the original header.
4076 template<
int X_BLOCK_SIZE,
4082 using cuda_flatten_global_size_xzy_loop =
4083 cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4084 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4085 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_yxz_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_y, cuda::global_x and
// cuda::global_z, in that order (matching the "yxz" suffix), each given its
// own block and grid size.
// NOTE(review): only Y_BLOCK_SIZE is visible in the parameter list here,
// yet the body uses six parameters ({X,Y,Z}_BLOCK_SIZE, {X,Y,Z}_GRID_SIZE);
// the other declarations (and any defaults) appear to be missing from this
// listing - confirm against the original header.
4086 template<
int Y_BLOCK_SIZE,
4092 using cuda_flatten_global_size_yxz_loop =
4093 cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4094 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4095 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_yzx_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_y, cuda::global_z and
// cuda::global_x, in that order (matching the "yzx" suffix), each given its
// own block and grid size.
// NOTE(review): only Y_BLOCK_SIZE is visible in the parameter list here,
// yet the body uses six parameters ({X,Y,Z}_BLOCK_SIZE, {X,Y,Z}_GRID_SIZE);
// the other declarations (and any defaults) appear to be missing from this
// listing - confirm against the original header.
4096 template<
int Y_BLOCK_SIZE,
4102 using cuda_flatten_global_size_yzx_loop =
4103 cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4104 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4105 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_zxy_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_z, cuda::global_x and
// cuda::global_y, in that order (matching the "zxy" suffix), each given its
// own block and grid size.
// NOTE(review): only Z_BLOCK_SIZE is visible in the parameter list here,
// yet the body uses six parameters ({X,Y,Z}_BLOCK_SIZE, {X,Y,Z}_GRID_SIZE);
// the other declarations (and any defaults) appear to be missing from this
// listing - confirm against the original header.
4106 template<
int Z_BLOCK_SIZE,
4112 using cuda_flatten_global_size_zxy_loop =
4113 cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4114 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
4115 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
// Alias template cuda_flatten_global_size_zyx_loop: shorthand for
// cuda_flatten_indexer_loop over cuda::global_z, cuda::global_y and
// cuda::global_x, in that order (matching the "zyx" suffix), each given its
// own block and grid size.
// NOTE(review): only Z_BLOCK_SIZE is visible in the parameter list here,
// yet the body uses six parameters ({X,Y,Z}_BLOCK_SIZE, {X,Y,Z}_GRID_SIZE);
// the other declarations (and any defaults) appear to be missing from this
// listing - confirm against the original header.
4116 template<
int Z_BLOCK_SIZE,
4122 using cuda_flatten_global_size_zyx_loop =
4123 cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
4124 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
4125 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
// Plain type aliases: every cuda_global_thread_<dims> name below refers to
// the cuda_global_<dims>_direct policy with the same dimension suffix, so
// either spelling names the identical type.
// NOTE(review): presumably retained as alternative/older spellings for
// existing callers - confirm before removing or renaming any of them.

// One-dimensional aliases.
4131 using cuda_global_thread_x = cuda_global_x_direct;
4132 using cuda_global_thread_y = cuda_global_y_direct;
4133 using cuda_global_thread_z = cuda_global_z_direct;
// Two-dimensional aliases (all ordered pairs of distinct dimensions).
4135 using cuda_global_thread_xy = cuda_global_xy_direct;
4136 using cuda_global_thread_xz = cuda_global_xz_direct;
4137 using cuda_global_thread_yx = cuda_global_yx_direct;
4138 using cuda_global_thread_yz = cuda_global_yz_direct;
4139 using cuda_global_thread_zx = cuda_global_zx_direct;
4140 using cuda_global_thread_zy = cuda_global_zy_direct;
// Three-dimensional aliases (all six orderings of x, y, z).
4142 using cuda_global_thread_xyz = cuda_global_xyz_direct;
4143 using cuda_global_thread_xzy = cuda_global_xzy_direct;
4144 using cuda_global_thread_yxz = cuda_global_yxz_direct;
4145 using cuda_global_thread_yzx = cuda_global_yzx_direct;
4146 using cuda_global_thread_zxy = cuda_global_zxy_direct;
4147 using cuda_global_thread_zyx = cuda_global_zyx_direct;
// Plain type aliases: every cuda_flatten_block_threads_<dims>_direct name
// below refers to cuda_flatten_thread_<dims>_direct with the same dimension
// suffix, so either spelling names the identical type.
// NOTE(review): presumably retained as alternative/older spellings for
// existing callers - confirm before removing or renaming any of them.

// Two-dimensional aliases.
4149 using cuda_flatten_block_threads_xy_direct = cuda_flatten_thread_xy_direct;
4150 using cuda_flatten_block_threads_xz_direct = cuda_flatten_thread_xz_direct;
4151 using cuda_flatten_block_threads_yx_direct = cuda_flatten_thread_yx_direct;
4152 using cuda_flatten_block_threads_yz_direct = cuda_flatten_thread_yz_direct;
4153 using cuda_flatten_block_threads_zx_direct = cuda_flatten_thread_zx_direct;
4154 using cuda_flatten_block_threads_zy_direct = cuda_flatten_thread_zy_direct;
// Three-dimensional aliases.
4156 using cuda_flatten_block_threads_xyz_direct = cuda_flatten_thread_xyz_direct;
4157 using cuda_flatten_block_threads_xzy_direct = cuda_flatten_thread_xzy_direct;
4158 using cuda_flatten_block_threads_yxz_direct = cuda_flatten_thread_yxz_direct;
4159 using cuda_flatten_block_threads_yzx_direct = cuda_flatten_thread_yzx_direct;
4160 using cuda_flatten_block_threads_zxy_direct = cuda_flatten_thread_zxy_direct;
4161 using cuda_flatten_block_threads_zyx_direct = cuda_flatten_thread_zyx_direct;
// Plain type aliases: every cuda_flatten_block_threads_<dims>_loop name
// below refers to cuda_flatten_thread_<dims>_loop with the same dimension
// suffix, so either spelling names the identical type.
// NOTE(review): presumably retained as alternative/older spellings for
// existing callers - confirm before removing or renaming any of them.

// Two-dimensional aliases.
4163 using cuda_flatten_block_threads_xy_loop = cuda_flatten_thread_xy_loop;
4164 using cuda_flatten_block_threads_xz_loop = cuda_flatten_thread_xz_loop;
4165 using cuda_flatten_block_threads_yx_loop = cuda_flatten_thread_yx_loop;
4166 using cuda_flatten_block_threads_yz_loop = cuda_flatten_thread_yz_loop;
4167 using cuda_flatten_block_threads_zx_loop = cuda_flatten_thread_zx_loop;
4168 using cuda_flatten_block_threads_zy_loop = cuda_flatten_thread_zy_loop;
// Three-dimensional aliases.
4170 using cuda_flatten_block_threads_xyz_loop = cuda_flatten_thread_xyz_loop;
4171 using cuda_flatten_block_threads_xzy_loop = cuda_flatten_thread_xzy_loop;
4172 using cuda_flatten_block_threads_yxz_loop = cuda_flatten_thread_yxz_loop;
4173 using cuda_flatten_block_threads_yzx_loop = cuda_flatten_thread_yzx_loop;
4174 using cuda_flatten_block_threads_zxy_loop = cuda_flatten_thread_zxy_loop;
4175 using cuda_flatten_block_threads_zyx_loop = cuda_flatten_thread_zyx_loop;
// Plain type aliases: every cuda_block_<dims>_nested_direct name below
// refers to cuda_block_<dims>_direct with the same dimension suffix, so
// either spelling names the identical type.
// NOTE(review): presumably retained as alternative/older spellings for
// existing callers - confirm before removing or renaming any of them.

// Two-dimensional aliases.
4177 using cuda_block_xy_nested_direct = cuda_block_xy_direct;
4178 using cuda_block_xz_nested_direct = cuda_block_xz_direct;
4179 using cuda_block_yx_nested_direct = cuda_block_yx_direct;
4180 using cuda_block_yz_nested_direct = cuda_block_yz_direct;
4181 using cuda_block_zx_nested_direct = cuda_block_zx_direct;
4182 using cuda_block_zy_nested_direct = cuda_block_zy_direct;
// Three-dimensional aliases.
4184 using cuda_block_xyz_nested_direct = cuda_block_xyz_direct;
4185 using cuda_block_xzy_nested_direct = cuda_block_xzy_direct;
4186 using cuda_block_yxz_nested_direct = cuda_block_yxz_direct;
4187 using cuda_block_yzx_nested_direct = cuda_block_yzx_direct;
4188 using cuda_block_zxy_nested_direct = cuda_block_zxy_direct;
4189 using cuda_block_zyx_nested_direct = cuda_block_zyx_direct;
// Plain type aliases: every cuda_block_<dims>_nested_loop name below refers
// to cuda_block_<dims>_loop with the same dimension suffix, so either
// spelling names the identical type.
// NOTE(review): presumably retained as alternative/older spellings for
// existing callers - confirm before removing or renaming any of them.

// Two-dimensional aliases.
4191 using cuda_block_xy_nested_loop = cuda_block_xy_loop;
4192 using cuda_block_xz_nested_loop = cuda_block_xz_loop;
4193 using cuda_block_yx_nested_loop = cuda_block_yx_loop;
4194 using cuda_block_yz_nested_loop = cuda_block_yz_loop;
4195 using cuda_block_zx_nested_loop = cuda_block_zx_loop;
4196 using cuda_block_zy_nested_loop = cuda_block_zy_loop;
// Three-dimensional aliases.
4198 using cuda_block_xyz_nested_loop = cuda_block_xyz_loop;
4199 using cuda_block_xzy_nested_loop = cuda_block_xzy_loop;
4200 using cuda_block_yxz_nested_loop = cuda_block_yxz_loop;
4201 using cuda_block_yzx_nested_loop = cuda_block_yzx_loop;
4202 using cuda_block_zxy_nested_loop = cuda_block_zxy_loop;
4203 using cuda_block_zyx_nested_loop = cuda_block_zyx_loop;
RAJA header file defining Simple Offset Calculators.
Header file for RAJA operator definitions.
Header file for basic RAJA policy mechanics.
Header file containing RAJA intrinsics templates for CUDA execution.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
Header file providing RAJA math templates.
multi_reduce_algorithm
Definition: policy.hpp:31
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
named_dim
Definition: types.hpp:53
Launch
Definition: PolicyBase.hpp:60
RAJA_HOST_DEVICE constexpr RAJA_INLINE T next_pow2(T n) noexcept
"round up" to the next greatest power of 2
Definition: math.hpp:63
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
kernel_sync_requirement
Definition: types.hpp:63
named_usage
Definition: types.hpp:44
@ ignored
Definition: types.hpp:45
@ unspecified
Definition: types.hpp:46
PolicyBaseT< Policy_, Pattern_, Launch_, Platform::undefined, Args... > make_policy_pattern_launch_t
Definition: PolicyBase.hpp:180
RAJA_HOST_DEVICE constexpr RAJA_INLINE T prev_pow2(T n) noexcept
"round down" to the largest power of 2 that is less than or equal to n
Definition: math.hpp:85
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file providing RAJA reduction declarations.
Header file containing RAJA sequential policy definitions.
static constexpr int_t multiply(int_t val) noexcept
Definition: types.hpp:255
Definition: PolicyBase.hpp:75
Header file for RAJA type definitions.