RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
policy.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_policy_hip_HPP
21 #define RAJA_policy_hip_HPP
22 
23 #include "RAJA/config.hpp"
24 
25 #if defined(RAJA_HIP_ACTIVE)
26 
27 #include <utility>
28 #include "hip/hip_runtime.h"
29 
30 #include "RAJA/pattern/reduce.hpp"
31 
35 
36 #include "RAJA/util/Operators.hpp"
38 #include "RAJA/util/types.hpp"
39 #include "RAJA/util/math.hpp"
40 
41 namespace RAJA
42 {
43 
44 using hip_dim_t = RAJA_HIP_DIM_T;
45 
46 using hip_dim_member_t = camp::decay<decltype(std::declval<hip_dim_t>().x)>;
47 
48 //
50 //
51 // Execution policies
52 //
54 //
55 
59 
60 namespace detail
61 {
62 template<bool Async>
63 struct get_launch
64 {
65  static constexpr RAJA::Launch value = RAJA::Launch::async;
66 };
67 
68 template<>
69 struct get_launch<false>
70 {
71  static constexpr RAJA::Launch value = RAJA::Launch::sync;
72 };
73 } // end namespace detail
74 
75 namespace hip
76 {
77 
79 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
80 struct IndexGlobal;
81 
82 template<typename... indexers>
83 struct IndexFlatten;
84 
85 template<size_t divisor, typename index>
86 struct IndexDivide;
87 
88 template<size_t divisor, typename index>
89 struct IndexModulo;
90 
// Concretizer whose maximum grid size is the function's per-SM block limit
// multiplied by the number of SMs on the device.
struct MaxOccupancyConcretizer
{
  // IdxT - integer type used for the arithmetic and the result.
  // data - must expose device_sm_per_device and func_max_blocks_per_sm.
  template<typename IdxT, typename Data>
  static IdxT get_max_grid_size(Data const& data)
  {
    const IdxT sm_count      = data.device_sm_per_device;
    const IdxT blocks_per_sm = data.func_max_blocks_per_sm;

    const IdxT blocks_per_device = blocks_per_sm * sm_count;
    return blocks_per_device;
  }
};
111 
// Concretizer that scales the function's per-SM block limit by a fraction and
// then shifts it by a constant offset before multiplying by the SM count.
//
// t_Fraction            - type providing rebind<IdxT>::multiply(value).
// BLOCKS_PER_SM_OFFSET  - signed offset added to the (scaled) per-SM limit.
//
// Each adjustment is applied only if its result is strictly positive;
// otherwise the previous per-SM value is kept.
template<typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
struct FractionOffsetOccupancyConcretizer
{
  // IdxT - integer type used for the result.
  // data - must expose device_sm_per_device and func_max_blocks_per_sm.
  template<typename IdxT, typename Data>
  static IdxT get_max_grid_size(Data const& data)
  {
    using Fraction = typename t_Fraction::template rebind<IdxT>;

    IdxT device_sm_per_device   = data.device_sm_per_device;
    IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;

    // Evaluate the scaled value once and reuse it for the test and the store.
    const auto scaled_blocks_per_sm =
        Fraction::multiply(func_max_blocks_per_sm);
    if (scaled_blocks_per_sm > IdxT(0))
    {
      func_max_blocks_per_sm = scaled_blocks_per_sm;
    }

    // Do the positivity test in the signed domain. Casting a negative sum to
    // an unsigned IdxT first would wrap to a huge positive value and pass the
    // check, producing an enormous grid size.
    const std::ptrdiff_t offset_blocks_per_sm =
        std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET;
    if (offset_blocks_per_sm > std::ptrdiff_t(0))
    {
      func_max_blocks_per_sm = IdxT(offset_blocks_per_sm);
    }

    IdxT func_max_blocks_per_device =
        func_max_blocks_per_sm * device_sm_per_device;

    return func_max_blocks_per_device;
  }
};
148 
157 template<typename AvoidMaxOccupancyConcretizer>
158 struct AvoidDeviceMaxThreadOccupancyConcretizer
159 {
160  template<typename IdxT, typename Data>
161  static IdxT get_max_grid_size(Data const& data)
162  {
163  IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
164  IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
165  IdxT func_threads_per_block = data.func_threads_per_block;
166 
167  IdxT func_max_threads_per_sm =
168  func_threads_per_block * func_max_blocks_per_sm;
169 
170  if (func_max_threads_per_sm < device_max_threads_per_sm)
171  {
172  return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
173  }
174  else
175  {
176  return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
177  data);
178  }
179  }
180 };
181 
185 template<size_t preferred_replication>
186 struct ConstantPreferredReplicationConcretizer
187 {
188  template<typename IdxT, typename Data>
189  static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
190  {
191  return IdxT(preferred_replication);
192  }
193 };
194 
// Preferred-replication concretizer that switches on threads per block:
// below t_cutoff it reports preferred_replication_before_cutoff, at or above
// t_cutoff it reports preferred_replication_after_cutoff.
template<size_t t_cutoff,
         size_t preferred_replication_before_cutoff,
         size_t preferred_replication_after_cutoff>
struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
{
  // data - must expose func_threads_per_block.
  template<typename IdxT, typename Data>
  static IdxT get_preferred_replication(Data const& data)
  {
    const IdxT threads_per_block = data.func_threads_per_block;
    const IdxT cutoff            = t_cutoff;

    return (threads_per_block < cutoff)
               ? IdxT(preferred_replication_before_cutoff)
               : IdxT(preferred_replication_after_cutoff);
  }
};
221 
// Shared-atomic replication concretizer: takes the smaller of the preferred
// replication and the per-block maximum, then passes it through prev_pow2
// (declared elsewhere; presumably rounds down to a power of two - confirm).
template<typename GetPreferredReplication>
struct SharedAtomicReplicationMaxPow2Concretizer
{
  // data - must expose func_max_shared_replication_per_block, plus whatever
  //        GetPreferredReplication reads.
  template<typename IdxT, typename Data>
  static IdxT get_shared_replication(Data const& data)
  {
    const IdxT max_per_block = data.func_max_shared_replication_per_block;

    const IdxT preferred =
        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
            data);

    const IdxT capped = std::min(preferred, max_per_block);
    return prev_pow2(capped);
  }
};
244 
// Global-atomic replication concretizer: takes the larger of the preferred
// replication and the function's minimum, then passes it through next_pow2
// (declared elsewhere; presumably rounds up to a power of two - confirm).
template<typename GetPreferredReplication>
struct GlobalAtomicReplicationMinPow2Concretizer
{
  // data - must expose func_min_global_replication, plus whatever
  //        GetPreferredReplication reads.
  template<typename IdxT, typename Data>
  static IdxT get_global_replication(Data const& data)
  {
    const IdxT min_required = data.func_min_global_replication;

    const IdxT preferred =
        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
            data);

    const IdxT raised = std::max(preferred, min_required);
    return next_pow2(raised);
  }
};
266 
267 
// Tag selecting the algorithm a ReduceTuning uses. The algorithms themselves
// are implemented elsewhere; enumerator values are 0, 1, 2 in order.
enum class reduce_algorithm : int
{
  combine_last_block,
  init_device_combine_atomic_block,
  init_host_combine_atomic_block
};
274 
// Tag selecting the block communication mode of a ReduceTuning. The meaning
// of each mode is defined by the code that consumes the tuning.
enum class block_communication_mode : int
{
  device_fence,
  block_fence
};
280 
281 template<reduce_algorithm t_algorithm,
282  block_communication_mode t_comm_mode,
283  size_t t_replication,
284  size_t t_atomic_stride>
285 struct ReduceTuning
286 {
287  static constexpr reduce_algorithm algorithm = t_algorithm;
288  static constexpr block_communication_mode comm_mode = t_comm_mode;
289  static constexpr size_t replication = t_replication;
290  static constexpr size_t atomic_stride = t_atomic_stride;
291  static constexpr bool consistent =
292  (algorithm == reduce_algorithm::combine_last_block);
293 };
294 
295 
// Tag selecting the algorithm a MultiReduceTuning uses. The algorithms
// themselves are implemented elsewhere.
enum class multi_reduce_algorithm : int
{
  init_host_combine_block_atomic_then_grid_atomic,
  init_host_combine_global_atomic
};
301 
302 template<typename t_AtomicReplicationConcretizer,
303  typename t_ReplicationIndexer,
304  typename t_OffsetCalculator>
305 struct AtomicReplicationTuning
306 {
307  using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
308  using ReplicationIndexer = t_ReplicationIndexer;
309  using OffsetCalculator = t_OffsetCalculator;
310 };
311 
312 template<multi_reduce_algorithm t_algorithm,
313  typename t_SharedAtomicReplicationTuning,
314  typename t_GlobalAtomicReplicationTuning>
315 struct MultiReduceTuning
316 {
317  static constexpr multi_reduce_algorithm algorithm = t_algorithm;
318  using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
319  using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
320  static constexpr bool consistent = false;
321 };
322 
323 } // namespace hip
324 
325 namespace policy
326 {
327 namespace hip
328 {
329 
330 template<typename _IterationMapping,
332  typename... _IterationGetters>
333 struct hip_indexer
334 {};
335 
336 template<typename _IterationMapping,
338  typename... _IterationGetters>
339 struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t<
340  RAJA::Policy::hip,
341  RAJA::Pattern::region,
342  detail::get_launch<true /*async */>::value,
343  RAJA::Platform::hip>
344 {
345  using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>;
346 };
347 
348 template<typename _IterationMapping,
349  typename _IterationGetter,
350  typename _LaunchConcretizer,
351  bool Async = false>
352 struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t<
353  RAJA::Policy::hip,
354  RAJA::Pattern::forall,
355  detail::get_launch<Async>::value,
356  RAJA::Platform::hip>
357 {
358  using IterationMapping = _IterationMapping;
359  using IterationGetter = _IterationGetter;
360  using LaunchConcretizer = _LaunchConcretizer;
361 };
362 
363 template<bool Async, int num_threads = named_usage::unspecified>
364 struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
365  RAJA::Policy::hip,
366  RAJA::Pattern::region,
367  detail::get_launch<Async>::value,
368  RAJA::Platform::hip>
369 {};
370 
371 //
372 // NOTE: There is no Index set segment iteration policy for HIP
373 //
374 
378 template<size_t BLOCK_SIZE, bool Async = false>
379 struct hip_work : public RAJA::make_policy_pattern_launch_platform_t<
380  RAJA::Policy::hip,
381  RAJA::Pattern::workgroup_exec,
382  detail::get_launch<Async>::value,
383  RAJA::Platform::hip>
384 {};
385 
390 struct unordered_hip_loop_y_block_iter_x_threadblock_average
392  RAJA::Policy::hip,
393  RAJA::Pattern::workgroup_order,
394  RAJA::Platform::hip>
395 {};
396 
404 
405 template<typename tuning>
406 struct hip_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
407  RAJA::Policy::hip,
408  RAJA::Pattern::reduce,
409  detail::get_launch<false>::value,
410  RAJA::Platform::hip,
411  std::conditional_t<tuning::consistent,
412  reduce::ordered,
413  reduce::unordered>>
414 {};
415 
416 template<typename tuning>
417 struct hip_multi_reduce_policy
419  RAJA::Policy::hip,
420  RAJA::Pattern::multi_reduce,
421  detail::get_launch<false>::value,
422  RAJA::Platform::hip,
423  std::conditional_t<tuning::consistent,
424  reduce::ordered,
425  reduce::unordered>>
426 {};
427 
432 template<typename host_policy>
433 struct hip_atomic_explicit
434 {};
435 
440 using hip_atomic = hip_atomic_explicit<seq_atomic>;
441 
442 // Policy for RAJA::statement::Reduce that reduces threads in a block
443 // down to threadIdx 0
444 struct hip_block_reduce
445 {};
446 
447 // Policy for RAJA::statement::Reduce that reduces threads in a warp
448 // down to the first lane of the warp
449 struct hip_warp_reduce
450 {};
451 
452 // Policy to map work directly to threads within a warp
453 // Maximum iteration count is WARP_SIZE
454 // Cannot be used in conjunction with hip_thread_x_*
455 // Multiple warps have to be created by using hip_thread_{yz}_*
456 // struct hip_warp_direct{};
457 
458 // Policy to map work to threads within a warp using a warp-stride loop
459 // Cannot be used in conjunction with hip_thread_x_*
460 // Multiple warps have to be created by using hip_thread_{yz}_*
461 // struct hip_warp_loop{};
462 
463 
464 // Policy to map work to threads within a warp using a bit mask
465 // Cannot be used in conjunction with hip_thread_x_*
466 // Multiple warps have to be created by using hip_thread_{yz}_*
467 // Since we are masking specific threads, multiple nested
468 // hip_warp_masked
469 // can be used to create complex thread interleaving patterns
470 template<typename Mask>
471 struct hip_warp_masked_direct
472 {};
473 
474 // Policy to map work to threads within a warp using a bit mask
475 // Cannot be used in conjunction with hip_thread_x_*
476 // Multiple warps have to be created by using hip_thread_{yz}_*
477 // Since we are masking specific threads, multiple nested
478 // hip_warp_masked
479 // can be used to create complex thread interleaving patterns
480 template<typename Mask>
481 struct hip_warp_masked_loop
482 {};
483 
484 template<typename Mask>
485 struct hip_thread_masked_direct
486 {};
487 
488 template<typename Mask>
489 struct hip_thread_masked_loop
490 {};
491 
492 struct hip_synchronize : make_policy_pattern_launch_t<Policy::hip,
493  Pattern::synchronize,
494  Launch::sync>
495 {};
496 
497 } // end namespace hip
498 } // end namespace policy
499 
500 namespace internal
501 {
502 
503 RAJA_INLINE
504 int get_size(hip_dim_t dims)
505 {
506  if (dims.x == 0 && dims.y == 0 && dims.z == 0)
507  {
508  return 0;
509  }
510  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
511 }
512 
513 struct HipDims
514 {
515 
516  hip_dim_t blocks {0, 0, 0};
517  hip_dim_t threads {0, 0, 0};
518 
519  HipDims() = default;
520  HipDims(HipDims const&) = default;
521  HipDims& operator=(HipDims const&) = default;
522 
523  RAJA_INLINE
524  HipDims(hip_dim_member_t default_val)
525  : blocks {default_val, default_val, default_val},
526  threads {default_val, default_val, default_val}
527  {}
528 
529  RAJA_INLINE
530  int num_blocks() const { return get_size(blocks); }
531 
532  RAJA_INLINE
533  int num_threads() const { return get_size(threads); }
534 
535  RAJA_INLINE
536  hip_dim_t get_blocks() const
537  {
538  if (num_blocks() != 0)
539  {
540  return {(blocks.x ? blocks.x : 1), (blocks.y ? blocks.y : 1),
541  (blocks.z ? blocks.z : 1)};
542  }
543  else
544  {
545  return blocks;
546  }
547  }
548 
549  RAJA_INLINE
550  hip_dim_t get_threads() const
551  {
552  if (num_threads() != 0)
553  {
554  return {(threads.x ? threads.x : 1), (threads.y ? threads.y : 1),
555  (threads.z ? threads.z : 1)};
556  }
557  else
558  {
559  return threads;
560  }
561  }
562 };
563 
564 template<named_dim dim>
565 struct HipDimHelper;
566 
567 template<>
568 struct HipDimHelper<named_dim::x>
569 {
570 
571  template<typename dim_t>
572  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
573  {
574  return d.x;
575  }
576 
577  template<typename dim_t>
578  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
579  {
580  d.x = value;
581  }
582 };
583 
584 template<>
585 struct HipDimHelper<named_dim::y>
586 {
587 
588  template<typename dim_t>
589  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
590  {
591  return d.y;
592  }
593 
594  template<typename dim_t>
595  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
596  {
597  d.y = value;
598  }
599 };
600 
601 template<>
602 struct HipDimHelper<named_dim::z>
603 {
604 
605  template<typename dim_t>
606  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
607  {
608  return d.z;
609  }
610 
611  template<typename dim_t>
612  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
613  {
614  d.z = value;
615  }
616 };
617 
618 template<named_dim dim, typename dim_t>
619 RAJA_HOST_DEVICE constexpr hip_dim_member_t get_hip_dim(dim_t const& d)
620 {
621  return HipDimHelper<dim>::get(d);
622 }
623 
624 template<named_dim dim, typename dim_t>
625 RAJA_HOST_DEVICE void set_hip_dim(dim_t& d, hip_dim_member_t value)
626 {
627  return HipDimHelper<dim>::set(d, value);
628 }
629 
630 } // namespace internal
631 
632 namespace hip
633 {
634 
636 struct IndexSize
637 {
638  hip_dim_member_t block_size = named_usage::unspecified;
639  hip_dim_member_t grid_size = named_usage::unspecified;
640 
641  RAJA_HOST_DEVICE constexpr IndexSize(
642  hip_dim_member_t _block_size = named_usage::unspecified,
643  hip_dim_member_t _grid_size = named_usage::unspecified)
644  : block_size(_block_size),
645  grid_size(_grid_size)
646  {}
647 };
648 
649 // Class to help cache thread indices or not based on template arg
650 template<bool cache_threadIdx>
651 struct ThreadIndices
652 {
653  template<named_dim dim>
654  RAJA_DEVICE constexpr hip_dim_member_t get_threadIdx() const
655  {
657  }
658 };
659 
660 template<>
661 struct ThreadIndices<true>
662 {
663  dim3 m_threadIdx;
664 
665  RAJA_HOST_DEVICE ThreadIndices()
666 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
667  : m_threadIdx(threadIdx)
668 #endif
669  {}
670 
671  template<named_dim dim>
672  RAJA_DEVICE constexpr hip_dim_member_t get_threadIdx() const
673  {
675  }
676 };
677 
678 // Class to help cache block indices or not based on template arg
679 template<bool cache_blockIdx>
680 struct BlockIndices
681 {
682  template<named_dim dim>
683  RAJA_DEVICE constexpr hip_dim_member_t get_blockIdx() const
684  {
686  }
687 };
688 
689 template<>
690 struct BlockIndices<true>
691 {
692  dim3 m_blockIdx;
693 
694  RAJA_HOST_DEVICE BlockIndices()
695 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
696  : m_blockIdx(blockIdx)
697 #endif
698  {}
699 
700  template<named_dim dim>
701  RAJA_DEVICE constexpr hip_dim_member_t get_blockIdx() const
702  {
704  }
705 };
706 
707 // Class to help cache block dimensions or not based on template arg
708 template<bool cache_blockDim>
709 struct BlockDimensions
710 {
711  template<named_dim dim>
712  RAJA_DEVICE constexpr hip_dim_member_t get_blockDim() const
713  {
715  }
716 };
717 
718 template<>
719 struct BlockDimensions<true>
720 {
721  dim3 m_blockDim;
722 
723  RAJA_HOST_DEVICE BlockDimensions()
724 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
725  : m_blockDim(blockDim)
726 #endif
727  {}
728 
729  template<named_dim dim>
730  RAJA_DEVICE constexpr hip_dim_member_t get_blockDim() const
731  {
733  }
734 };
735 
736 // Class to help cache grid dimensions or not based on template arg
737 template<bool cache_gridDim>
738 struct GridDimensions
739 {
740  template<named_dim dim>
741  RAJA_DEVICE constexpr hip_dim_member_t get_gridDim() const
742  {
744  }
745 };
746 
747 template<>
748 struct GridDimensions<true>
749 {
750  dim3 m_gridDim = gridDim;
751 
752  RAJA_HOST_DEVICE GridDimensions()
753 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
754  : m_gridDim(gridDim)
755 #endif
756  {}
757 
758  template<named_dim dim>
759  RAJA_DEVICE constexpr hip_dim_member_t get_gridDim() const
760  {
762  }
763 };
764 
765 // Class to help cache indices and dimensions or not based on template args
766 template<bool cache_threadIdx,
767  bool cache_blockIdx,
768  bool cache_blockDim,
769  bool cache_gridDim>
770 struct IndicesAndDims : ThreadIndices<cache_threadIdx>,
771  BlockIndices<cache_blockIdx>,
772  BlockDimensions<cache_blockDim>,
773  GridDimensions<cache_gridDim>
774 {};
775 
776 // Nothing cached
777 using NonCachedIndicesAndDims = IndicesAndDims<false, false, false, false>;
778 
779 // only blockDim cached; threadIdx, blockIdx and gridDim not cached
780 using CachedBlockDims = IndicesAndDims<false, false, true, false>;
781 
782 // threadIdx, blockIdx, blockDim, gridDim cached
783 using AllCachedIndicesAndDims = IndicesAndDims<true, true, true, true>;
784 
792 template<typename IndicesAndDimsT = NonCachedIndicesAndDims>
793 struct LaunchContextIndicesAndDimsPolicy
794 {
795  using indices_and_dims_t = IndicesAndDimsT;
796 };
797 
798 using LaunchContextNonCachedIndicesAndDimsPolicy =
799  LaunchContextIndicesAndDimsPolicy<NonCachedIndicesAndDims>;
800 
801 using LaunchContextCachedBlockDimsPolicy =
802  LaunchContextIndicesAndDimsPolicy<CachedBlockDims>;
803 
804 using LaunchContextAllCachedIndicesAndDimsPolicy =
805  LaunchContextIndicesAndDimsPolicy<AllCachedIndicesAndDims>;
806 
809 
812 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
813 struct IndexGlobal
814 {
815  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
816  static_assert(GRID_SIZE > 0, "grid size must not be negative");
817 
818  static constexpr int block_size = BLOCK_SIZE;
819  static constexpr int grid_size = GRID_SIZE;
820 
821  template<typename IdxT = hip_dim_member_t,
822  typename IdxNDims = NonCachedIndicesAndDims>
823  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
824  {
825  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
826  static_cast<IdxT>(block_size) *
827  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
828  }
829 
830  template<typename IdxT = hip_dim_member_t,
831  typename IdxNDims = NonCachedIndicesAndDims>
832  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
833  {
834  return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
835  }
836 };
837 
839 template<named_dim dim, int GRID_SIZE>
840 struct IndexGlobal<dim, 1, GRID_SIZE>
841 {
842  static_assert(GRID_SIZE > 0, "grid size must not be negative");
843 
844  static constexpr int block_size = 1;
845  static constexpr int grid_size = GRID_SIZE;
846 
847  template<typename IdxT = hip_dim_member_t,
848  typename IdxNDims = NonCachedIndicesAndDims>
849  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
850  {
851  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
852  }
853 
854  template<typename IdxT = hip_dim_member_t,
855  typename IdxNDims = NonCachedIndicesAndDims>
856  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
857  {
858  return static_cast<IdxT>(grid_size);
859  }
860 };
861 
863 template<named_dim dim, int BLOCK_SIZE>
864 struct IndexGlobal<dim, BLOCK_SIZE, 1>
865 {
866  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
867 
868  static constexpr int block_size = BLOCK_SIZE;
869  static constexpr int grid_size = 1;
870 
871  template<typename IdxT = hip_dim_member_t,
872  typename IdxNDims = NonCachedIndicesAndDims>
873  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
874  {
875  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
876  }
877 
878  template<typename IdxT = hip_dim_member_t,
879  typename IdxNDims = NonCachedIndicesAndDims>
880  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
881  {
882  return static_cast<IdxT>(block_size);
883  }
884 };
885 
887 template<named_dim dim>
888 struct IndexGlobal<dim, 1, 1>
889 {
890  static constexpr int block_size = 1;
891  static constexpr int grid_size = 1;
892 
893  template<typename IdxT = hip_dim_member_t,
894  typename IdxNDims = NonCachedIndicesAndDims>
895  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
896  {
897  return static_cast<IdxT>(0);
898  }
899 
900  template<typename IdxT = hip_dim_member_t,
901  typename IdxNDims = NonCachedIndicesAndDims>
902  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
903  {
904  return static_cast<IdxT>(1);
905  }
906 };
907 
909 template<named_dim dim, int GRID_SIZE>
910 struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
911 {
912  static_assert(GRID_SIZE > 0, "grid size must not be negative");
913 
914  static constexpr int block_size = named_usage::unspecified;
915  static constexpr int grid_size = GRID_SIZE;
916 
917  template<typename IdxT = hip_dim_member_t,
918  typename IdxNDims = NonCachedIndicesAndDims>
919  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
920  {
921  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
922  static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
923  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
924  }
925 
926  template<typename IdxT = hip_dim_member_t,
927  typename IdxNDims = NonCachedIndicesAndDims>
928  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
929  {
930  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
931  static_cast<IdxT>(grid_size);
932  }
933 };
934 
936 template<named_dim dim>
937 struct IndexGlobal<dim, named_usage::unspecified, 1>
938 {
939  static constexpr int block_size = named_usage::unspecified;
940  static constexpr int grid_size = 1;
941 
942  template<typename IdxT = hip_dim_member_t,
943  typename IdxNDims = NonCachedIndicesAndDims>
944  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
945  {
946  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
947  }
948 
949  template<typename IdxT = hip_dim_member_t,
950  typename IdxNDims = NonCachedIndicesAndDims>
951  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
952  {
953  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>());
954  }
955 };
956 
958 template<named_dim dim, int BLOCK_SIZE>
959 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
960 {
961  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
962 
963  static constexpr int block_size = BLOCK_SIZE;
964  static constexpr int grid_size = named_usage::unspecified;
965 
966  template<typename IdxT = hip_dim_member_t,
967  typename IdxNDims = NonCachedIndicesAndDims>
968  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
969  {
970  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
971  static_cast<IdxT>(block_size) *
972  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
973  }
974 
975  template<typename IdxT = hip_dim_member_t,
976  typename IdxNDims = NonCachedIndicesAndDims>
977  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
978  {
979  return static_cast<IdxT>(block_size) *
980  static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
981  }
982 };
983 
985 template<named_dim dim>
986 struct IndexGlobal<dim, 1, named_usage::unspecified>
987 {
988  static constexpr int block_size = 1;
989  static constexpr int grid_size = named_usage::unspecified;
990 
991  template<typename IdxT = hip_dim_member_t,
992  typename IdxNDims = NonCachedIndicesAndDims>
993  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
994  {
995  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
996  }
997 
998  template<typename IdxT = hip_dim_member_t,
999  typename IdxNDims = NonCachedIndicesAndDims>
1000  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1001  {
1002  return static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
1003  }
1004 };
1005 
1007 template<named_dim dim>
1008 struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
1009 {
1010  static constexpr int block_size = named_usage::unspecified;
1011  static constexpr int grid_size = named_usage::unspecified;
1012 
1013  template<typename IdxT = hip_dim_member_t,
1014  typename IdxNDims = NonCachedIndicesAndDims>
1015  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1016  {
1017  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>()) +
1018  static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
1019  static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
1020  }
1021 
1022  template<typename IdxT = hip_dim_member_t,
1023  typename IdxNDims = NonCachedIndicesAndDims>
1024  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1025  {
1026  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>()) *
1027  static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
1028  }
1029 };
1030 
1033 template<named_dim dim, int GRID_SIZE>
1034 struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
1035 {
1036  static_assert(GRID_SIZE > 0, "grid size must not be negative");
1037 
1038  static constexpr int block_size = named_usage::ignored;
1039  static constexpr int grid_size = GRID_SIZE;
1040 
1041  template<typename IdxT = hip_dim_member_t,
1042  typename IdxNDims = NonCachedIndicesAndDims>
1043  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1044  {
1045  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
1046  }
1047 
1048  template<typename IdxT = hip_dim_member_t,
1049  typename IdxNDims = NonCachedIndicesAndDims>
1050  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1051  {
1052  return static_cast<IdxT>(grid_size);
1053  }
1054 };
1055 
1057 template<named_dim dim>
1058 struct IndexGlobal<dim, named_usage::ignored, 1>
1059 {
1060  static constexpr int block_size = named_usage::ignored;
1061  static constexpr int grid_size = 1;
1062 
1063  template<typename IdxT = hip_dim_member_t,
1064  typename IdxNDims = NonCachedIndicesAndDims>
1065  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1066  {
1067  return static_cast<IdxT>(0);
1068  }
1069 
1070  template<typename IdxT = hip_dim_member_t,
1071  typename IdxNDims = NonCachedIndicesAndDims>
1072  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1073  {
1074  return static_cast<IdxT>(1);
1075  }
1076 };
1077 
1079 template<named_dim dim>
1080 struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
1081 {
1082  static constexpr int block_size = named_usage::ignored;
1083  static constexpr int grid_size = named_usage::unspecified;
1084 
1085  template<typename IdxT = hip_dim_member_t,
1086  typename IdxNDims = NonCachedIndicesAndDims>
1087  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1088  {
1089  return static_cast<IdxT>(idxNDims.template get_blockIdx<dim>());
1090  }
1091 
1092  template<typename IdxT = hip_dim_member_t,
1093  typename IdxNDims = NonCachedIndicesAndDims>
1094  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1095  {
1096  return static_cast<IdxT>(idxNDims.template get_gridDim<dim>());
1097  }
1098 };
1099 
1102 template<named_dim dim, int BLOCK_SIZE>
1103 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
1104 {
1105  static_assert(BLOCK_SIZE > 0, "block size must not be negative");
1106 
1107  static constexpr int block_size = BLOCK_SIZE;
1108  static constexpr int grid_size = named_usage::ignored;
1109 
1110  template<typename IdxT = hip_dim_member_t,
1111  typename IdxNDims = NonCachedIndicesAndDims>
1112  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1113  {
1114  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
1115  }
1116 
1117  template<typename IdxT = hip_dim_member_t,
1118  typename IdxNDims = NonCachedIndicesAndDims>
1119  RAJA_DEVICE static constexpr IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1120  {
1121  return static_cast<IdxT>(block_size);
1122  }
1123 };
1124 
1126 template<named_dim dim>
1127 struct IndexGlobal<dim, 1, named_usage::ignored>
1128 {
1129  static constexpr int block_size = 1;
1130  static constexpr int grid_size = named_usage::ignored;
1131 
1132  template<typename IdxT = hip_dim_member_t,
1133  typename IdxNDims = NonCachedIndicesAndDims>
1134  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1135  {
1136  return static_cast<IdxT>(0);
1137  }
1138 
1139  template<typename IdxT = hip_dim_member_t,
1140  typename IdxNDims = NonCachedIndicesAndDims>
1141  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1142  {
1143  return static_cast<IdxT>(1);
1144  }
1145 };
1146 
1148 template<named_dim dim>
1149 struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
1150 {
1151  static constexpr int block_size = named_usage::unspecified;
1152  static constexpr int grid_size = named_usage::ignored;
1153 
1154  template<typename IdxT = hip_dim_member_t,
1155  typename IdxNDims = NonCachedIndicesAndDims>
1156  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1157  {
1158  return static_cast<IdxT>(idxNDims.template get_threadIdx<dim>());
1159  }
1160 
1161  template<typename IdxT = hip_dim_member_t,
1162  typename IdxNDims = NonCachedIndicesAndDims>
1163  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1164  {
1165  return static_cast<IdxT>(idxNDims.template get_blockDim<dim>());
1166  }
1167 };
1168 
// Specialization of IndexGlobal with both the block size and the grid size
// ignored. Both members return constants and never read idxNDims.
1171 template<named_dim dim>
1172 struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
1173 {
1174  static constexpr int block_size = named_usage::ignored;
1175  static constexpr int grid_size = named_usage::ignored;
1176 
 // Returns 0 converted to IdxT.
1177  template<typename IdxT = hip_dim_member_t,
1178  typename IdxNDims = NonCachedIndicesAndDims>
1179  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1180  {
1181  return static_cast<IdxT>(0);
1182  }
1183 
 // Returns 1 converted to IdxT.
1184  template<typename IdxT = hip_dim_member_t,
1185  typename IdxNDims = NonCachedIndicesAndDims>
1186  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1187  {
1188  return static_cast<IdxT>(1);
1189  }
1190 };
1191 
1192 // Single-indexer case of IndexFlatten: forwards index() and size() to
// x_index unchanged, requesting IdxT as the result type.
1193 template<typename x_index>
1194 struct IndexFlatten<x_index>
1195 {
1196 
 // Returns x_index::index<IdxT>(idxNDims).
1197  template<typename IdxT = hip_dim_member_t,
1198  typename IdxNDims = NonCachedIndicesAndDims>
1199  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1200  {
1201 
1202  return x_index::template index<IdxT>(idxNDims);
1203  }
1204 
 // Returns x_index::size<IdxT>(idxNDims).
1205  template<typename IdxT = hip_dim_member_t,
1206  typename IdxNDims = NonCachedIndicesAndDims>
1207  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1208  {
1209  return x_index::template size<IdxT>(idxNDims);
1210  }
1211 };
1212 
1213 // Two-indexer case of IndexFlatten: combines x_index and y_index into one
// linear index in which x_index varies fastest.
1214 template<typename x_index, typename y_index>
1215 struct IndexFlatten<x_index, y_index>
1216 {
1217 
 // Returns x + x_size * y, where x/y are the indexers' index() values and
 // x_size is x_index::size().
1218  template<typename IdxT = hip_dim_member_t,
1219  typename IdxNDims = NonCachedIndicesAndDims>
1220  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1221  {
1222 
1223  return x_index::template index<IdxT>(idxNDims) +
1224  x_index::template size<IdxT>(idxNDims) *
1225  (y_index::template index<IdxT>(idxNDims));
1226  }
1227 
 // Returns the product of the two indexers' sizes.
1228  template<typename IdxT = hip_dim_member_t,
1229  typename IdxNDims = NonCachedIndicesAndDims>
1230  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1231  {
1232  return x_index::template size<IdxT>(idxNDims) *
1233  y_index::template size<IdxT>(idxNDims);
1234  }
1235 };
1236 
1237 // Three-indexer case of IndexFlatten: combines x_index, y_index and z_index
// into one linear index in which x_index varies fastest and z_index slowest.
1238 template<typename x_index, typename y_index, typename z_index>
1239 struct IndexFlatten<x_index, y_index, z_index>
1240 {
1241 
 // Returns x + x_size * (y + y_size * z), where x/y/z are the indexers'
 // index() values and x_size/y_size their size() values.
1242  template<typename IdxT = hip_dim_member_t,
1243  typename IdxNDims = NonCachedIndicesAndDims>
1244  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1245  {
1246 
1247  return x_index::template index<IdxT>(idxNDims) +
1248  x_index::template size<IdxT>(idxNDims) *
1249  (y_index::template index<IdxT>(idxNDims) +
1250  y_index::template size<IdxT>(idxNDims) *
1251  z_index::template index<IdxT>(idxNDims));
1252  }
1253 
 // Returns the product of the three indexers' sizes.
1254  template<typename IdxT = hip_dim_member_t,
1255  typename IdxNDims = NonCachedIndicesAndDims>
1256  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1257  {
1258  return x_index::template size<IdxT>(idxNDims) *
1259  y_index::template size<IdxT>(idxNDims) *
1260  z_index::template size<IdxT>(idxNDims);
1261  }
1262 };
1263 
// Wraps another indexer and divides its index by the compile-time `divisor`.
// NOTE(review): nothing here rejects divisor == 0 — confirm callers never
// instantiate it that way.
1264 template<size_t divisor, typename indexer>
1265 struct IndexDivide
1266 {
 // Returns indexer::index<IdxT>(idxNDims) / divisor (division in IdxT).
1267  template<typename IdxT = hip_dim_member_t,
1268  typename IdxNDims = NonCachedIndicesAndDims>
1269  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1270  {
1271  return indexer::template index<IdxT>(idxNDims) / static_cast<IdxT>(divisor);
1272  }
1273 
 // Returns RAJA_DIVIDE_CEILING_INT(indexer size, divisor). The macro is
 // defined elsewhere; presumably it rounds the quotient up — verify.
1274  template<typename IdxT = hip_dim_member_t,
1275  typename IdxNDims = NonCachedIndicesAndDims>
1276  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1277  {
1278  return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(idxNDims),
1279  static_cast<IdxT>(divisor));
1280  }
1281 };
1282 
// Wraps another indexer and reduces its index modulo the compile-time
// `divisor`.
// NOTE(review): nothing here rejects divisor == 0 — confirm callers never
// instantiate it that way.
1283 template<size_t divisor, typename indexer>
1284 struct IndexModulo
1285 {
 // Returns indexer::index<IdxT>(idxNDims) % divisor (remainder in IdxT).
1286  template<typename IdxT = hip_dim_member_t,
1287  typename IdxNDims = NonCachedIndicesAndDims>
1288  RAJA_DEVICE static inline IdxT index(IdxNDims const& idxNDims = IdxNDims {})
1289  {
1290  return indexer::template index<IdxT>(idxNDims) % static_cast<IdxT>(divisor);
1291  }
1292 
 // Returns divisor converted to IdxT; the wrapped indexer's size is not
 // consulted and idxNDims is not read.
1293  template<typename IdxT = hip_dim_member_t,
1294  typename IdxNDims = NonCachedIndicesAndDims>
1295  RAJA_DEVICE static inline IdxT size(IdxNDims const& idxNDims = IdxNDims {})
1296  {
1297  return static_cast<IdxT>(divisor);
1298  }
1299 };
1300 
1301 
1302 // Type trait that keeps only the thread-indexing part of an indexer type.
// The primary template is declared but not defined; only the specializations
// below provide a nested `type`.
1303 template<typename index_global>
1304 struct get_index_thread;
1305 
// IndexGlobal: keep dim and BLOCK_SIZE, replace GRID_SIZE with
// named_usage::ignored.
1307 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
1308 struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
1309 {
1310  using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
1311 };
1312 
// Three-indexer IndexFlatten: apply get_index_thread to each component.
// No specialization for one- or two-indexer IndexFlatten is visible here.
1314 template<typename x_index, typename y_index, typename z_index>
1315 struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
1316 {
1317  using type = IndexFlatten<typename get_index_thread<x_index>::type,
1318  typename get_index_thread<y_index>::type,
1319  typename get_index_thread<z_index>::type>;
1320 };
1321 
1322 // Type trait that keeps only the block-indexing part of an indexer type.
// The primary template is declared but not defined; only the specializations
// below provide a nested `type`.
1323 template<typename index_global>
1324 struct get_index_block;
1325 
// IndexGlobal: keep dim and GRID_SIZE, replace BLOCK_SIZE with
// named_usage::ignored.
1327 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
1328 struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
1329 {
1330  using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
1331 };
1332 
// Three-indexer IndexFlatten: apply get_index_block to each component.
// No specialization for one- or two-indexer IndexFlatten is visible here.
1334 template<typename x_index, typename y_index, typename z_index>
1335 struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
1336 {
1337  using type = IndexFlatten<typename get_index_block<x_index>::type,
1338  typename get_index_block<y_index>::type,
1339  typename get_index_block<z_index>::type>;
1340 };
1341 
// Per-dimension thread indexers: IndexGlobal with the given BLOCK_SIZE
// (default named_usage::unspecified) and the grid size ignored.
1342 template<size_t BLOCK_SIZE = named_usage::unspecified>
1343 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
1344 template<size_t BLOCK_SIZE = named_usage::unspecified>
1345 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
1346 template<size_t BLOCK_SIZE = named_usage::unspecified>
1347 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
1348 
// Flattened x,y,z thread indexer built from the three aliases above.
1349 template<size_t BLOCK_SIZE_X = named_usage::unspecified,
1350  size_t BLOCK_SIZE_Y = named_usage::unspecified,
1351  size_t BLOCK_SIZE_Z = named_usage::unspecified>
1352 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
1353  thread_y<BLOCK_SIZE_Y>,
1354  thread_z<BLOCK_SIZE_Z>>;
1355 
// Per-dimension block indexers: IndexGlobal with the block size ignored and
// the given GRID_SIZE (default named_usage::unspecified).
1356 template<size_t GRID_SIZE = named_usage::unspecified>
1357 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
1358 template<size_t GRID_SIZE = named_usage::unspecified>
1359 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
1360 template<size_t GRID_SIZE = named_usage::unspecified>
1361 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
1362 
// Flattened x,y,z block indexer built from the three aliases above.
1363 template<size_t GRID_SIZE_X = named_usage::unspecified,
1364  size_t GRID_SIZE_Y = named_usage::unspecified,
1365  size_t GRID_SIZE_Z = named_usage::unspecified>
1366 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
1367  block_y<GRID_SIZE_Y>,
1368  block_z<GRID_SIZE_Z>>;
1369 
// Per-dimension global indexers: IndexGlobal with both sizes supplied.
// BLOCK_SIZE has no default here; GRID_SIZE defaults to unspecified.
1370 template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
1371 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
1372 template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
1373 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
1374 template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
1375 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
1376 
1377 
// Flattened x,y,z global indexer built from the three aliases above.
1378 template<size_t BLOCK_SIZE_X,
1379  size_t BLOCK_SIZE_Y,
1380  size_t BLOCK_SIZE_Z,
1381  size_t GRID_SIZE_X = named_usage::unspecified,
1382  size_t GRID_SIZE_Y = named_usage::unspecified,
1383  size_t GRID_SIZE_Z = named_usage::unspecified>
1384 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
1385  global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
1386  global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
1387 
1388 
// Warp indexer within a block: the flattened thread_xyz index divided by
// WARP_SIZE (defaults to device_constants.WARP_SIZE).
1389 template<size_t WARP_SIZE = RAJA::policy::hip::device_constants.WARP_SIZE,
1390  size_t BLOCK_SIZE_X = named_usage::unspecified,
1391  size_t BLOCK_SIZE_Y = named_usage::unspecified,
1392  size_t BLOCK_SIZE_Z = named_usage::unspecified>
1393 using warp_xyz =
1394  IndexDivide<WARP_SIZE,
1395  thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
1396 
// Warp indexer across the grid: a two-indexer IndexFlatten of warp_xyz and
// block_xyz, so the in-block warp index varies fastest.
1397 template<size_t WARP_SIZE = RAJA::policy::hip::device_constants.WARP_SIZE,
1398  size_t BLOCK_SIZE_X = named_usage::unspecified,
1399  size_t BLOCK_SIZE_Y = named_usage::unspecified,
1400  size_t BLOCK_SIZE_Z = named_usage::unspecified,
1401  size_t GRID_SIZE_X = named_usage::unspecified,
1402  size_t GRID_SIZE_Y = named_usage::unspecified,
1403  size_t GRID_SIZE_Z = named_usage::unspecified>
1404 using warp_global_xyz =
1405  IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
1406  block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
1407 
1408 } // namespace hip
1409 
// Hip-prefixed aliases that expose the hip:: index/dimension holder types in
// the enclosing namespace.
1410 using HipAllCachedIndicesAndDims = hip::AllCachedIndicesAndDims;
1411 using HipCachedBlockDims = hip::CachedBlockDims;
1412 using HipNonCachedIndicesAndDims = hip::NonCachedIndicesAndDims;
1413 
// Generic form: one bool per quantity, forwarded unchanged to
// hip::IndicesAndDims.
1414 template<bool cache_threadIdx,
1415  bool cache_blockIdx,
1416  bool cache_blockDim,
1417  bool cache_gridDim>
1418 using HipIndicesAndDims = hip::IndicesAndDims<cache_threadIdx,
1419  cache_blockIdx,
1420  cache_blockDim,
1421  cache_gridDim>;
1422 
// Hip-prefixed aliases for the matching hip:: launch-context policies.
1423 using HipLaunchContextAllCachedIndicesAndDimsPolicy =
1424  hip::LaunchContextAllCachedIndicesAndDimsPolicy;
1425 using HipLaunchContextCachedBlockDimsPolicy =
1426  hip::LaunchContextCachedBlockDimsPolicy;
// Generic form; IndicesAndDimsT defaults to hip::NonCachedIndicesAndDims.
1427 template<typename IndicesAndDimsT = hip::NonCachedIndicesAndDims>
1428 using HipLaunchContextIndicesAndDimsPolicy =
1429  hip::LaunchContextIndicesAndDimsPolicy<IndicesAndDimsT>;
1430 using HipLaunchContextNonCachedIndicesAndDimsPolicy =
1431  hip::LaunchContextNonCachedIndicesAndDimsPolicy;
1432 
1433 // concretizers used in forall, scan, and sort policies
1434 
// hip::AvoidDeviceMaxThreadOccupancyConcretizer wrapping a
// FractionOffsetOccupancyConcretizer with Fraction<size_t, 1, 1> and a
// blocks-per-SM offset of -1.
1435 using HipAvoidDeviceMaxThreadOccupancyConcretizer =
1436  hip::AvoidDeviceMaxThreadOccupancyConcretizer<
1437  hip::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
1438 
// Caller-selected fraction and blocks-per-SM offset, forwarded unchanged.
1439 template<typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
1440 using HipFractionOffsetOccupancyConcretizer =
1441  hip::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
1442 
1443 using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer;
1444 
// Default for policies used with reductions: Fraction<size_t, 1, 2> with an
// offset of 0.
1445 using HipReduceDefaultConcretizer =
1446  HipFractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 2>, 0>;
1447 
// Default concretizer for the remaining policies.
1448 using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer;
1449 
1450 // policies usable with forall, scan, and sort
1451 
1452 template<size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
1453 using hip_exec_grid = policy::hip::hip_exec<
1454  iteration_mapping::StridedLoop<named_usage::unspecified>,
1455  hip::global_x<BLOCK_SIZE, GRID_SIZE>,
1456  HipDefaultConcretizer,
1457  Async>;
1458 
1459 template<size_t BLOCK_SIZE, size_t GRID_SIZE>
1460 using hip_exec_grid_async = policy::hip::hip_exec<
1461  iteration_mapping::StridedLoop<named_usage::unspecified>,
1462  hip::global_x<BLOCK_SIZE, GRID_SIZE>,
1463  HipDefaultConcretizer,
1464  true>;
1465 
1466 template<size_t BLOCK_SIZE, bool Async = false>
1467 using hip_exec = policy::hip::hip_exec<iteration_mapping::Direct,
1468  hip::global_x<BLOCK_SIZE>,
1469  HipDefaultConcretizer,
1470  Async>;
1471 
1472 template<size_t BLOCK_SIZE>
1473 using hip_exec_async = policy::hip::hip_exec<iteration_mapping::Direct,
1474  hip::global_x<BLOCK_SIZE>,
1475  HipDefaultConcretizer,
1476  true>;
1477 
1478 template<size_t BLOCK_SIZE, bool Async = false>
1479 using hip_exec_occ_calc = policy::hip::hip_exec<
1480  iteration_mapping::StridedLoop<named_usage::unspecified>,
1481  hip::global_x<BLOCK_SIZE>,
1482  HipDefaultConcretizer,
1483  Async>;
1484 
1485 template<size_t BLOCK_SIZE>
1486 using hip_exec_occ_calc_async = policy::hip::hip_exec<
1487  iteration_mapping::StridedLoop<named_usage::unspecified>,
1488  hip::global_x<BLOCK_SIZE>,
1489  HipDefaultConcretizer,
1490  true>;
1491 
1492 template<size_t BLOCK_SIZE, bool Async = false>
1493 using hip_exec_occ_max = policy::hip::hip_exec<
1494  iteration_mapping::StridedLoop<named_usage::unspecified>,
1495  hip::global_x<BLOCK_SIZE>,
1496  HipMaxOccupancyConcretizer,
1497  Async>;
1498 
1499 template<size_t BLOCK_SIZE>
1500 using hip_exec_occ_max_async = policy::hip::hip_exec<
1501  iteration_mapping::StridedLoop<named_usage::unspecified>,
1502  hip::global_x<BLOCK_SIZE>,
1503  HipMaxOccupancyConcretizer,
1504  true>;
1505 
1506 template<size_t BLOCK_SIZE, typename Fraction, bool Async = false>
1507 using hip_exec_occ_fraction = policy::hip::hip_exec<
1508  iteration_mapping::StridedLoop<named_usage::unspecified>,
1509  hip::global_x<BLOCK_SIZE>,
1510  HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
1511  Async>;
1512 
1513 template<size_t BLOCK_SIZE, typename Fraction>
1514 using hip_exec_occ_fraction_async = policy::hip::hip_exec<
1515  iteration_mapping::StridedLoop<named_usage::unspecified>,
1516  hip::global_x<BLOCK_SIZE>,
1517  HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
1518  true>;
1519 
1520 template<size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
1521 using hip_exec_occ_custom = policy::hip::hip_exec<
1522  iteration_mapping::StridedLoop<named_usage::unspecified>,
1523  hip::global_x<BLOCK_SIZE>,
1524  Concretizer,
1525  Async>;
1526 
1527 template<size_t BLOCK_SIZE, typename Concretizer>
1528 using hip_exec_occ_custom_async = policy::hip::hip_exec<
1529  iteration_mapping::StridedLoop<named_usage::unspecified>,
1530  hip::global_x<BLOCK_SIZE>,
1531  Concretizer,
1532  true>;
1533 
1534 template<size_t BLOCK_SIZE, bool Async = false>
1535 using hip_exec_with_reduce = policy::hip::hip_exec<
1536  iteration_mapping::StridedLoop<named_usage::unspecified>,
1537  hip::global_x<BLOCK_SIZE>,
1538  HipReduceDefaultConcretizer,
1539  Async>;
1540 
1541 template<size_t BLOCK_SIZE>
1542 using hip_exec_with_reduce_async = policy::hip::hip_exec<
1543  iteration_mapping::StridedLoop<named_usage::unspecified>,
1544  hip::global_x<BLOCK_SIZE>,
1545  HipReduceDefaultConcretizer,
1546  true>;
1547 
1548 template<bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
1549 using hip_exec_base =
1550  std::conditional_t<with_reduce,
1551  hip_exec_with_reduce<BLOCK_SIZE, Async>,
1552  hip_exec<BLOCK_SIZE, Async>>;
1553 
1554 template<bool with_reduce, size_t BLOCK_SIZE>
1555 using hip_exec_base_async =
1556  std::conditional_t<with_reduce,
1557  hip_exec_with_reduce_async<BLOCK_SIZE>,
1558  hip_exec_async<BLOCK_SIZE>>;
1559 
1560 // policies usable with WorkGroup
1561 using policy::hip::hip_work;
1562 
// hip_work with its second template argument fixed to true.
// NOTE(review): presumably that argument is the Async flag — confirm against
// the hip_work declaration.
1563 template<size_t BLOCK_SIZE>
1564 using hip_work_async = policy::hip::hip_work<BLOCK_SIZE, true>;
1565 
1566 using policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
1567 
1568 // policies usable with atomics
1569 using policy::hip::hip_atomic;
1570 using policy::hip::hip_atomic_explicit;
1572 
1573 // policies usable with reducers
// Builds a hip_reduce_policy from a hip::ReduceTuning made of the four
// template arguments. replication and atomic_stride default to
// named_usage::unspecified.
1574 template<hip::reduce_algorithm algorithm,
1575  hip::block_communication_mode comm_mode,
1576  size_t replication = named_usage::unspecified,
1577  size_t atomic_stride = named_usage::unspecified>
1578 using hip_reduce_tuning = policy::hip::hip_reduce_policy<
1579  hip::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
1580 
1581 // Policies for RAJA::Reduce* objects with specific behaviors.
1582 // - non-atomic policies store partial results and combine them in the same
1583 // order every time, leading to consistent results for a loop run to run.
1584 // - *atomic* policies may use atomics to combine partial results. The
1585 // use of atomics leads to order of operation differences which change the
1586 // results of floating point sum reductions for a loop run to run. Falls back
1587 // on a non-atomic implementation if atomics can't be used with the given
1588 // type. The memory used with atomics is initialized on the device using
1589 // atomics which adds overhead.
1590 // - *atomic_host* policies are similar to the atomic policies above. However
1591 // the memory used with atomics is initialized on the host. This is faster
1592 // overall than other policies on HW with direct host access to device memory
1593 // such as the AMD MI300A El Capitan/Tuolumne systems.
1594 // - *device_fence* policies use normal memory accesses with device scope fences
1595 // in the implementation. This works on all HW.
1596 // - *block_fence* policies use special (atomic) memory accesses that use
1597 // a cache shared by the whole device to avoid having to use
1598 // device scope fences. This improves performance on some HW but
1599 // is more difficult to code correctly.
// Named reduce policies: one per combination of reduce algorithm and block
// communication mode. The trailing replication and atomic_stride arguments
// are written out as named_usage::unspecified, which is also the default
// declared on hip_reduce_tuning.
1600 using hip_reduce_device_fence =
1601  hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
1602  hip::block_communication_mode::device_fence,
1603  named_usage::unspecified,
1604  named_usage::unspecified>;
1605 
1606 using hip_reduce_block_fence =
1607  hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
1608  hip::block_communication_mode::block_fence,
1609  named_usage::unspecified,
1610  named_usage::unspecified>;
1611 
1612 using hip_reduce_atomic_device_init_device_fence =
1613  hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
1614  hip::block_communication_mode::device_fence,
1615  named_usage::unspecified,
1616  named_usage::unspecified>;
1617 
1618 using hip_reduce_atomic_device_init_block_fence =
1619  hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
1620  hip::block_communication_mode::block_fence,
1621  named_usage::unspecified,
1622  named_usage::unspecified>;
1623 
1624 using hip_reduce_atomic_host_init_device_fence =
1625  hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
1626  hip::block_communication_mode::device_fence,
1627  named_usage::unspecified,
1628  named_usage::unspecified>;
1629 
1630 using hip_reduce_atomic_host_init_block_fence =
1631  hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
1632  hip::block_communication_mode::block_fence,
1633  named_usage::unspecified,
1634  named_usage::unspecified>;
1635 
1636 // Policy for RAJA::Reduce* objects that gives the same answer every time when
1637 // used in the same way. Currently the block_fence variant of the
// combine_last_block algorithm.
1638 using hip_reduce = hip_reduce_block_fence;
1639 
1640 // Policy for RAJA::Reduce* objects that may use atomics and may not give the
1641 // same answer every time when used in the same way. Currently the
// host-initialized, block_fence atomic variant.
1642 using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence;
1643 
1644 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
1645 // non-atomic policy with a bool: true selects hip_reduce_atomic, false
// selects hip_reduce.
1646 template<bool with_atomic>
1647 using hip_reduce_base =
1648  std::conditional_t<with_atomic, hip_reduce_atomic, hip_reduce>;
1649 
1650 
1651 // policies usable with multi_reducers
// Builds a hip_multi_reduce_policy from a hip::MultiReduceTuning holding the
// algorithm and two hip::AtomicReplicationTuning entries:
//  - shared:  the Shared* concretizer and indexer, with GetOffsetRight<int>
//  - global:  the Global* concretizer and indexer, with GetOffsetLeft<int>
1652 template<hip::multi_reduce_algorithm algorithm,
1653  typename SharedAtomicReplicationConcretizer,
1654  typename SharedAtomicReplicationIndexer,
1655  typename GlobalAtomicReplicationConcretizer,
1656  typename GlobalAtomicReplicationIndexer>
1657 using hip_multi_reduce_tuning =
1658  policy::hip::hip_multi_reduce_policy<hip::MultiReduceTuning<
1659  algorithm,
1660  hip::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
1661  SharedAtomicReplicationIndexer,
1662  GetOffsetRight<int>>,
1663  hip::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
1664  GlobalAtomicReplicationIndexer,
1665  GetOffsetLeft<int>>>>;
1666 
1667 // Policies for RAJA::MultiReduce* objects with specific behaviors.
1668 // - *atomic* policies may use atomics to combine partial results. The
1669 // use of atomics leads to order of operation differences which change the
1670 // results of floating point sum reductions for a loop run to run.
1671 // - *no_replication* policies use the minimum amount of resources. The
1672 // lack of resources means they may perform poorly. These policies are
1673 // intended for use cases where low overhead is more important than high
1674 // performance such as error flags that are rarely set.
1675 // - *host_init* policies initialize memory used with atomics on the host.
1676 // This is faster overall than other policies on HW with direct host access
1677 // to device memory such as the AMD MI300A El Capitan/Tuolumne systems.
// Shared replication: constant 4 via the MaxPow2 concretizer, indexed by
// thread_xyz. Global replication: constant 32 via the MinPow2 concretizer,
// indexed by warp_global_xyz.
1678 using hip_multi_reduce_atomic_block_then_atomic_grid_host_init =
1679  hip_multi_reduce_tuning<
1680  hip::multi_reduce_algorithm::
1681  init_host_combine_block_atomic_then_grid_atomic,
1682  hip::SharedAtomicReplicationMaxPow2Concretizer<
1683  hip::ConstantPreferredReplicationConcretizer<4>>,
1684  hip::thread_xyz<>,
1685  hip::GlobalAtomicReplicationMinPow2Concretizer<
1686  hip::ConstantPreferredReplicationConcretizer<32>>,
1687  hip::warp_global_xyz<>>;
1688 // special policy to test that multi-reducers work if there is not enough shmem
// Same as the policy above except the shared replication constant is 0.
1689 using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
1690  hip_multi_reduce_tuning<
1691  hip::multi_reduce_algorithm::
1692  init_host_combine_block_atomic_then_grid_atomic,
1693  hip::SharedAtomicReplicationMaxPow2Concretizer<
1694  hip::ConstantPreferredReplicationConcretizer<0>>,
1695  hip::thread_xyz<>,
1696  hip::GlobalAtomicReplicationMinPow2Concretizer<
1697  hip::ConstantPreferredReplicationConcretizer<32>>,
1698  hip::warp_global_xyz<>>;
1699 //
// Global-atomic-only algorithm: global replication constant 32, indexed by
// warp_global_xyz; the shared-atomic arguments are void.
1700 using hip_multi_reduce_atomic_global_host_init = hip_multi_reduce_tuning<
1701  hip::multi_reduce_algorithm::init_host_combine_global_atomic,
1702  void, // unused with this algorithm
1703  void, // unused with this algorithm
1704  hip::GlobalAtomicReplicationMinPow2Concretizer<
1705  hip::ConstantPreferredReplicationConcretizer<32>>,
1706  hip::warp_global_xyz<>>;
1707 //
// Global-atomic-only algorithm with a replication constant of 1, indexed by
// block_xyz.
1708 using hip_multi_reduce_atomic_global_no_replication_host_init =
1709  hip_multi_reduce_tuning<
1710  hip::multi_reduce_algorithm::init_host_combine_global_atomic,
1711  void, // unused with this algorithm
1712  void, // unused with this algorithm
1713  hip::GlobalAtomicReplicationMinPow2Concretizer<
1714  hip::ConstantPreferredReplicationConcretizer<1>>,
1715  hip::block_xyz<>>;
1716 
1717 // Policy for RAJA::MultiReduce* objects that may use atomics and may not give
1718 // the same answer every time when used in the same way. Currently the
// block-then-grid atomic, host-initialized policy.
1719 using hip_multi_reduce_atomic =
1720  hip_multi_reduce_atomic_block_then_atomic_grid_host_init;
1721 // Similar to above but optimized for low overhead in cases where it is rarely
1722 // used. Currently the global-atomic, no-replication, host-initialized policy.
1723 using hip_multi_reduce_atomic_low_performance_low_overhead =
1724  hip_multi_reduce_atomic_global_no_replication_host_init;
1725 
1726 
1727 // policies usable with kernel
1728 using policy::hip::hip_block_reduce;
1729 using policy::hip::hip_warp_reduce;
1730 
// Warp-level indexer policies: one per iteration mapping, each using
// thread_x with a block size of device_constants.WARP_SIZE.
// NOTE(review): listing lines 1733, 1737 and 1741 are absent from this
// extract, so a template argument between the iteration mapping and the
// indexer may have been dropped from each alias — confirm against the
// upstream header.
1731 using hip_warp_direct_unchecked = RAJA::policy::hip::hip_indexer<
1732  iteration_mapping::DirectUnchecked,
1734  hip::thread_x<RAJA::policy::hip::device_constants.WARP_SIZE>>;
1735 using hip_warp_direct = RAJA::policy::hip::hip_indexer<
1736  iteration_mapping::Direct,
1738  hip::thread_x<RAJA::policy::hip::device_constants.WARP_SIZE>>;
1739 using hip_warp_loop = RAJA::policy::hip::hip_indexer<
1740  iteration_mapping::StridedLoop<named_usage::unspecified>,
1742  hip::thread_x<RAJA::policy::hip::device_constants.WARP_SIZE>>;
1743 
// Masked warp and thread policies, re-exported from policy::hip.
1744 using policy::hip::hip_warp_masked_direct;
1745 using policy::hip::hip_warp_masked_loop;
1746 
1747 using policy::hip::hip_thread_masked_direct;
1748 using policy::hip::hip_thread_masked_loop;
1749 
1750 // policies usable with synchronize
1751 using policy::hip::hip_synchronize;
1752 
1753 // policies usable with launch
1754 using policy::hip::hip_launch_t;
1756 
1757 // policies usable with kernel and launch
// Each alias fixes the iteration mapping and forwards an arbitrary list of
// indexers to policy::hip::hip_indexer or policy::hip::hip_flatten_indexer.
// NOTE(review): every alias below skips one listing number (1761, 1767,
// 1773, 1779, 1785, 1791, 1797), and hip_indexer_loop and
// hip_indexer_syncable_loop are identical as shown. Presumably a template
// argument between the iteration mapping and `indexers...` was lost in this
// extract — confirm against the upstream header before relying on these.
1758 template<typename... indexers>
1759 using hip_indexer_direct_unchecked =
1760  policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
1762  indexers...>;
1763 
1764 template<typename... indexers>
1765 using hip_indexer_direct =
1766  policy::hip::hip_indexer<iteration_mapping::Direct,
1768  indexers...>;
1769 
1770 template<typename... indexers>
1771 using hip_indexer_loop = policy::hip::hip_indexer<
1772  iteration_mapping::StridedLoop<named_usage::unspecified>,
1774  indexers...>;
1775 
1776 template<typename... indexers>
1777 using hip_indexer_syncable_loop = policy::hip::hip_indexer<
1778  iteration_mapping::StridedLoop<named_usage::unspecified>,
1780  indexers...>;
1781 
1782 template<typename... indexers>
1783 using hip_flatten_indexer_direct_unchecked =
1784  policy::hip::hip_flatten_indexer<iteration_mapping::DirectUnchecked,
1786  indexers...>;
1787 
1788 template<typename... indexers>
1789 using hip_flatten_indexer_direct =
1790  policy::hip::hip_flatten_indexer<iteration_mapping::Direct,
1792  indexers...>;
1793 
1794 template<typename... indexers>
1795 using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
1796  iteration_mapping::StridedLoop<named_usage::unspecified>,
1798  indexers...>;
1799 
1800 
// Thread-level DirectUnchecked policies. The template expands to
// hip_indexer_direct_unchecked over one thread indexer (block size
// unspecified, grid size ignored) per listed dimension, in the order given.
// The named aliases cover every ordering of one, two or three of x, y, z.
1808 template<named_dim... dims>
1809 using hip_thread_direct_unchecked = hip_indexer_direct_unchecked<
1810  hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
1811 using hip_thread_x_direct_unchecked = hip_thread_direct_unchecked<named_dim::x>;
1812 using hip_thread_y_direct_unchecked = hip_thread_direct_unchecked<named_dim::y>;
1813 using hip_thread_z_direct_unchecked = hip_thread_direct_unchecked<named_dim::z>;
1814 using hip_thread_xy_direct_unchecked =
1815  hip_thread_direct_unchecked<named_dim::x, named_dim::y>;
1816 using hip_thread_xz_direct_unchecked =
1817  hip_thread_direct_unchecked<named_dim::x, named_dim::z>;
1818 using hip_thread_yx_direct_unchecked =
1819  hip_thread_direct_unchecked<named_dim::y, named_dim::x>;
1820 using hip_thread_yz_direct_unchecked =
1821  hip_thread_direct_unchecked<named_dim::y, named_dim::z>;
1822 using hip_thread_zx_direct_unchecked =
1823  hip_thread_direct_unchecked<named_dim::z, named_dim::x>;
1824 using hip_thread_zy_direct_unchecked =
1825  hip_thread_direct_unchecked<named_dim::z, named_dim::y>;
1826 using hip_thread_xyz_direct_unchecked =
1827  hip_thread_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
1828 using hip_thread_xzy_direct_unchecked =
1829  hip_thread_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
1830 using hip_thread_yxz_direct_unchecked =
1831  hip_thread_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
1832 using hip_thread_yzx_direct_unchecked =
1833  hip_thread_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
1834 using hip_thread_zxy_direct_unchecked =
1835  hip_thread_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
1836 using hip_thread_zyx_direct_unchecked =
1837  hip_thread_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
1838 
// Block-level DirectUnchecked policies. The template expands to
// hip_indexer_direct_unchecked over one block indexer (block size ignored,
// grid size unspecified) per listed dimension, in the order given.
// The named aliases cover every ordering of one, two or three of x, y, z.
1839 template<named_dim... dims>
1840 using hip_block_direct_unchecked = hip_indexer_direct_unchecked<
1841  hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
1842 using hip_block_x_direct_unchecked = hip_block_direct_unchecked<named_dim::x>;
1843 using hip_block_y_direct_unchecked = hip_block_direct_unchecked<named_dim::y>;
1844 using hip_block_z_direct_unchecked = hip_block_direct_unchecked<named_dim::z>;
1845 using hip_block_xy_direct_unchecked =
1846  hip_block_direct_unchecked<named_dim::x, named_dim::y>;
1847 using hip_block_xz_direct_unchecked =
1848  hip_block_direct_unchecked<named_dim::x, named_dim::z>;
1849 using hip_block_yx_direct_unchecked =
1850  hip_block_direct_unchecked<named_dim::y, named_dim::x>;
1851 using hip_block_yz_direct_unchecked =
1852  hip_block_direct_unchecked<named_dim::y, named_dim::z>;
1853 using hip_block_zx_direct_unchecked =
1854  hip_block_direct_unchecked<named_dim::z, named_dim::x>;
1855 using hip_block_zy_direct_unchecked =
1856  hip_block_direct_unchecked<named_dim::z, named_dim::y>;
1857 using hip_block_xyz_direct_unchecked =
1858  hip_block_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
1859 using hip_block_xzy_direct_unchecked =
1860  hip_block_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
1861 using hip_block_yxz_direct_unchecked =
1862  hip_block_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
1863 using hip_block_yzx_direct_unchecked =
1864  hip_block_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
1865 using hip_block_zxy_direct_unchecked =
1866  hip_block_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
1867 using hip_block_zyx_direct_unchecked =
1868  hip_block_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
1869 
// Global-level DirectUnchecked policies. The template expands to
// hip_indexer_direct_unchecked over one global indexer (block size and grid
// size both unspecified) per listed dimension, in the order given.
// The named aliases cover every ordering of one, two or three of x, y, z.
1870 template<named_dim... dims>
1871 using hip_global_direct_unchecked =
1872  hip_indexer_direct_unchecked<hip::IndexGlobal<dims,
1873  named_usage::unspecified,
1874  named_usage::unspecified>...>;
1875 using hip_global_x_direct_unchecked = hip_global_direct_unchecked<named_dim::x>;
1876 using hip_global_y_direct_unchecked = hip_global_direct_unchecked<named_dim::y>;
1877 using hip_global_z_direct_unchecked = hip_global_direct_unchecked<named_dim::z>;
1878 using hip_global_xy_direct_unchecked =
1879  hip_global_direct_unchecked<named_dim::x, named_dim::y>;
1880 using hip_global_xz_direct_unchecked =
1881  hip_global_direct_unchecked<named_dim::x, named_dim::z>;
1882 using hip_global_yx_direct_unchecked =
1883  hip_global_direct_unchecked<named_dim::y, named_dim::x>;
1884 using hip_global_yz_direct_unchecked =
1885  hip_global_direct_unchecked<named_dim::y, named_dim::z>;
1886 using hip_global_zx_direct_unchecked =
1887  hip_global_direct_unchecked<named_dim::z, named_dim::x>;
1888 using hip_global_zy_direct_unchecked =
1889  hip_global_direct_unchecked<named_dim::z, named_dim::y>;
1890 using hip_global_xyz_direct_unchecked =
1891  hip_global_direct_unchecked<named_dim::x, named_dim::y, named_dim::z>;
1892 using hip_global_xzy_direct_unchecked =
1893  hip_global_direct_unchecked<named_dim::x, named_dim::z, named_dim::y>;
1894 using hip_global_yxz_direct_unchecked =
1895  hip_global_direct_unchecked<named_dim::y, named_dim::x, named_dim::z>;
1896 using hip_global_yzx_direct_unchecked =
1897  hip_global_direct_unchecked<named_dim::y, named_dim::z, named_dim::x>;
1898 using hip_global_zxy_direct_unchecked =
1899  hip_global_direct_unchecked<named_dim::z, named_dim::x, named_dim::y>;
1900 using hip_global_zyx_direct_unchecked =
1901  hip_global_direct_unchecked<named_dim::z, named_dim::y, named_dim::x>;
1902 
// Thread-level Direct policies. The template expands to hip_indexer_direct
// over one thread indexer (block size unspecified, grid size ignored) per
// listed dimension, in the order given.
// The named aliases cover every ordering of one, two or three of x, y, z.
1910 template<named_dim... dims>
1911 using hip_thread_direct = hip_indexer_direct<
1912  hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
1913 using hip_thread_x_direct = hip_thread_direct<named_dim::x>;
1914 using hip_thread_y_direct = hip_thread_direct<named_dim::y>;
1915 using hip_thread_z_direct = hip_thread_direct<named_dim::z>;
1916 using hip_thread_xy_direct = hip_thread_direct<named_dim::x, named_dim::y>;
1917 using hip_thread_xz_direct = hip_thread_direct<named_dim::x, named_dim::z>;
1918 using hip_thread_yx_direct = hip_thread_direct<named_dim::y, named_dim::x>;
1919 using hip_thread_yz_direct = hip_thread_direct<named_dim::y, named_dim::z>;
1920 using hip_thread_zx_direct = hip_thread_direct<named_dim::z, named_dim::x>;
1921 using hip_thread_zy_direct = hip_thread_direct<named_dim::z, named_dim::y>;
1922 using hip_thread_xyz_direct =
1923  hip_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
1924 using hip_thread_xzy_direct =
1925  hip_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
1926 using hip_thread_yxz_direct =
1927  hip_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
1928 using hip_thread_yzx_direct =
1929  hip_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
1930 using hip_thread_zxy_direct =
1931  hip_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
1932 using hip_thread_zyx_direct =
1933  hip_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
1934 
// Block-level Direct policies. The template expands to hip_indexer_direct
// over one block indexer (block size ignored, grid size unspecified) per
// listed dimension, in the order given.
// The named aliases cover every ordering of one, two or three of x, y, z.
1935 template<named_dim... dims>
1936 using hip_block_direct = hip_indexer_direct<
1937  hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
1938 using hip_block_x_direct = hip_block_direct<named_dim::x>;
1939 using hip_block_y_direct = hip_block_direct<named_dim::y>;
1940 using hip_block_z_direct = hip_block_direct<named_dim::z>;
1941 using hip_block_xy_direct = hip_block_direct<named_dim::x, named_dim::y>;
1942 using hip_block_xz_direct = hip_block_direct<named_dim::x, named_dim::z>;
1943 using hip_block_yx_direct = hip_block_direct<named_dim::y, named_dim::x>;
1944 using hip_block_yz_direct = hip_block_direct<named_dim::y, named_dim::z>;
1945 using hip_block_zx_direct = hip_block_direct<named_dim::z, named_dim::x>;
1946 using hip_block_zy_direct = hip_block_direct<named_dim::z, named_dim::y>;
1947 using hip_block_xyz_direct =
1948  hip_block_direct<named_dim::x, named_dim::y, named_dim::z>;
1949 using hip_block_xzy_direct =
1950  hip_block_direct<named_dim::x, named_dim::z, named_dim::y>;
1951 using hip_block_yxz_direct =
1952  hip_block_direct<named_dim::y, named_dim::x, named_dim::z>;
1953 using hip_block_yzx_direct =
1954  hip_block_direct<named_dim::y, named_dim::z, named_dim::x>;
1955 using hip_block_zxy_direct =
1956  hip_block_direct<named_dim::z, named_dim::x, named_dim::y>;
1957 using hip_block_zyx_direct =
1958  hip_block_direct<named_dim::z, named_dim::y, named_dim::x>;
1959 
// hip_global_direct<dims...> instantiates hip_indexer_direct with one
// hip::IndexGlobal per listed dimension. Both the BLOCK_SIZE and the
// GRID_SIZE argument of IndexGlobal are named_usage::unspecified, i.e. the
// thread-within-block and block-within-grid parts are both in play (the
// thread variant ignores the grid, the block variant ignores the block).
1960 template<named_dim... dims>
1961 using hip_global_direct =
1962  hip_indexer_direct<hip::IndexGlobal<dims,
1963  named_usage::unspecified,
1964  named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
1965 using hip_global_x_direct = hip_global_direct<named_dim::x>;
1966 using hip_global_y_direct = hip_global_direct<named_dim::y>;
1967 using hip_global_z_direct = hip_global_direct<named_dim::z>;
1968 using hip_global_xy_direct = hip_global_direct<named_dim::x, named_dim::y>;
1969 using hip_global_xz_direct = hip_global_direct<named_dim::x, named_dim::z>;
1970 using hip_global_yx_direct = hip_global_direct<named_dim::y, named_dim::x>;
1971 using hip_global_yz_direct = hip_global_direct<named_dim::y, named_dim::z>;
1972 using hip_global_zx_direct = hip_global_direct<named_dim::z, named_dim::x>;
1973 using hip_global_zy_direct = hip_global_direct<named_dim::z, named_dim::y>;
1974 using hip_global_xyz_direct =
1975  hip_global_direct<named_dim::x, named_dim::y, named_dim::z>;
1976 using hip_global_xzy_direct =
1977  hip_global_direct<named_dim::x, named_dim::z, named_dim::y>;
1978 using hip_global_yxz_direct =
1979  hip_global_direct<named_dim::y, named_dim::x, named_dim::z>;
1980 using hip_global_yzx_direct =
1981  hip_global_direct<named_dim::y, named_dim::z, named_dim::x>;
1982 using hip_global_zxy_direct =
1983  hip_global_direct<named_dim::z, named_dim::x, named_dim::y>;
1984 using hip_global_zyx_direct =
1985  hip_global_direct<named_dim::z, named_dim::y, named_dim::x>;
1986 
// hip_thread_loop<dims...> instantiates hip_indexer_loop with one
// hip::IndexGlobal per listed dimension (BLOCK_SIZE argument
// named_usage::unspecified, GRID_SIZE argument named_usage::ignored).
// NOTE(review): by analogy with the flatten_*_loop comment later in this
// file, the loop form presumably strides so that the range may exceed the
// number of physical threads -- confirm against hip_indexer_loop.
1992 template<named_dim... dims>
1993 using hip_thread_loop = hip_indexer_loop<
1994  hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
1995 using hip_thread_x_loop = hip_thread_loop<named_dim::x>;
1996 using hip_thread_y_loop = hip_thread_loop<named_dim::y>;
1997 using hip_thread_z_loop = hip_thread_loop<named_dim::z>;
1998 using hip_thread_xy_loop = hip_thread_loop<named_dim::x, named_dim::y>;
1999 using hip_thread_xz_loop = hip_thread_loop<named_dim::x, named_dim::z>;
2000 using hip_thread_yx_loop = hip_thread_loop<named_dim::y, named_dim::x>;
2001 using hip_thread_yz_loop = hip_thread_loop<named_dim::y, named_dim::z>;
2002 using hip_thread_zx_loop = hip_thread_loop<named_dim::z, named_dim::x>;
2003 using hip_thread_zy_loop = hip_thread_loop<named_dim::z, named_dim::y>;
2004 using hip_thread_xyz_loop =
2005  hip_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
2006 using hip_thread_xzy_loop =
2007  hip_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
2008 using hip_thread_yxz_loop =
2009  hip_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
2010 using hip_thread_yzx_loop =
2011  hip_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
2012 using hip_thread_zxy_loop =
2013  hip_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
2014 using hip_thread_zyx_loop =
2015  hip_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
2016 
// hip_block_loop<dims...> instantiates hip_indexer_loop with one
// hip::IndexGlobal per listed dimension, passing named_usage::ignored as the
// BLOCK_SIZE argument and named_usage::unspecified as the GRID_SIZE argument.
2017 template<named_dim... dims>
2018 using hip_block_loop = hip_indexer_loop<
2019  hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2020 using hip_block_x_loop = hip_block_loop<named_dim::x>;
2021 using hip_block_y_loop = hip_block_loop<named_dim::y>;
2022 using hip_block_z_loop = hip_block_loop<named_dim::z>;
2023 using hip_block_xy_loop = hip_block_loop<named_dim::x, named_dim::y>;
2024 using hip_block_xz_loop = hip_block_loop<named_dim::x, named_dim::z>;
2025 using hip_block_yx_loop = hip_block_loop<named_dim::y, named_dim::x>;
2026 using hip_block_yz_loop = hip_block_loop<named_dim::y, named_dim::z>;
2027 using hip_block_zx_loop = hip_block_loop<named_dim::z, named_dim::x>;
2028 using hip_block_zy_loop = hip_block_loop<named_dim::z, named_dim::y>;
2029 using hip_block_xyz_loop =
2030  hip_block_loop<named_dim::x, named_dim::y, named_dim::z>;
2031 using hip_block_xzy_loop =
2032  hip_block_loop<named_dim::x, named_dim::z, named_dim::y>;
2033 using hip_block_yxz_loop =
2034  hip_block_loop<named_dim::y, named_dim::x, named_dim::z>;
2035 using hip_block_yzx_loop =
2036  hip_block_loop<named_dim::y, named_dim::z, named_dim::x>;
2037 using hip_block_zxy_loop =
2038  hip_block_loop<named_dim::z, named_dim::x, named_dim::y>;
2039 using hip_block_zyx_loop =
2040  hip_block_loop<named_dim::z, named_dim::y, named_dim::x>;
2041 
// hip_global_loop<dims...> instantiates hip_indexer_loop with one
// hip::IndexGlobal per listed dimension. Both the BLOCK_SIZE and the
// GRID_SIZE argument of IndexGlobal are named_usage::unspecified, in
// contrast to the thread variant (grid ignored) and the block variant
// (block ignored).
2042 template<named_dim... dims>
2043 using hip_global_loop =
2044  hip_indexer_loop<hip::IndexGlobal<dims,
2045  named_usage::unspecified,
2046  named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2047 using hip_global_x_loop = hip_global_loop<named_dim::x>;
2048 using hip_global_y_loop = hip_global_loop<named_dim::y>;
2049 using hip_global_z_loop = hip_global_loop<named_dim::z>;
2050 using hip_global_xy_loop = hip_global_loop<named_dim::x, named_dim::y>;
2051 using hip_global_xz_loop = hip_global_loop<named_dim::x, named_dim::z>;
2052 using hip_global_yx_loop = hip_global_loop<named_dim::y, named_dim::x>;
2053 using hip_global_yz_loop = hip_global_loop<named_dim::y, named_dim::z>;
2054 using hip_global_zx_loop = hip_global_loop<named_dim::z, named_dim::x>;
2055 using hip_global_zy_loop = hip_global_loop<named_dim::z, named_dim::y>;
2056 using hip_global_xyz_loop =
2057  hip_global_loop<named_dim::x, named_dim::y, named_dim::z>;
2058 using hip_global_xzy_loop =
2059  hip_global_loop<named_dim::x, named_dim::z, named_dim::y>;
2060 using hip_global_yxz_loop =
2061  hip_global_loop<named_dim::y, named_dim::x, named_dim::z>;
2062 using hip_global_yzx_loop =
2063  hip_global_loop<named_dim::y, named_dim::z, named_dim::x>;
2064 using hip_global_zxy_loop =
2065  hip_global_loop<named_dim::z, named_dim::x, named_dim::y>;
2066 using hip_global_zyx_loop =
2067  hip_global_loop<named_dim::z, named_dim::y, named_dim::x>;
2068 
// hip_thread_syncable_loop<dims...> instantiates hip_indexer_syncable_loop
// with one hip::IndexGlobal per listed dimension (BLOCK_SIZE argument
// named_usage::unspecified, GRID_SIZE argument named_usage::ignored).
// NOTE(review): the name suggests a loop form in which synchronizing the
// participating threads inside the loop body is allowed -- confirm against
// the hip_indexer_syncable_loop definition.
2076 template<named_dim... dims>
2077 using hip_thread_syncable_loop = hip_indexer_syncable_loop<
2078  hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2079 using hip_thread_x_syncable_loop = hip_thread_syncable_loop<named_dim::x>;
2080 using hip_thread_y_syncable_loop = hip_thread_syncable_loop<named_dim::y>;
2081 using hip_thread_z_syncable_loop = hip_thread_syncable_loop<named_dim::z>;
2082 using hip_thread_xy_syncable_loop =
2083  hip_thread_syncable_loop<named_dim::x, named_dim::y>;
2084 using hip_thread_xz_syncable_loop =
2085  hip_thread_syncable_loop<named_dim::x, named_dim::z>;
2086 using hip_thread_yx_syncable_loop =
2087  hip_thread_syncable_loop<named_dim::y, named_dim::x>;
2088 using hip_thread_yz_syncable_loop =
2089  hip_thread_syncable_loop<named_dim::y, named_dim::z>;
2090 using hip_thread_zx_syncable_loop =
2091  hip_thread_syncable_loop<named_dim::z, named_dim::x>;
2092 using hip_thread_zy_syncable_loop =
2093  hip_thread_syncable_loop<named_dim::z, named_dim::y>;
2094 using hip_thread_xyz_syncable_loop =
2095  hip_thread_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2096 using hip_thread_xzy_syncable_loop =
2097  hip_thread_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2098 using hip_thread_yxz_syncable_loop =
2099  hip_thread_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2100 using hip_thread_yzx_syncable_loop =
2101  hip_thread_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2102 using hip_thread_zxy_syncable_loop =
2103  hip_thread_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2104 using hip_thread_zyx_syncable_loop =
2105  hip_thread_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
2106 
// hip_block_syncable_loop<dims...> instantiates hip_indexer_syncable_loop
// with one hip::IndexGlobal per listed dimension, passing
// named_usage::ignored as the BLOCK_SIZE argument and
// named_usage::unspecified as the GRID_SIZE argument.
2107 template<named_dim... dims>
2108 using hip_block_syncable_loop = hip_indexer_syncable_loop<
2109  hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2110 using hip_block_x_syncable_loop = hip_block_syncable_loop<named_dim::x>;
2111 using hip_block_y_syncable_loop = hip_block_syncable_loop<named_dim::y>;
2112 using hip_block_z_syncable_loop = hip_block_syncable_loop<named_dim::z>;
2113 using hip_block_xy_syncable_loop =
2114  hip_block_syncable_loop<named_dim::x, named_dim::y>;
2115 using hip_block_xz_syncable_loop =
2116  hip_block_syncable_loop<named_dim::x, named_dim::z>;
2117 using hip_block_yx_syncable_loop =
2118  hip_block_syncable_loop<named_dim::y, named_dim::x>;
2119 using hip_block_yz_syncable_loop =
2120  hip_block_syncable_loop<named_dim::y, named_dim::z>;
2121 using hip_block_zx_syncable_loop =
2122  hip_block_syncable_loop<named_dim::z, named_dim::x>;
2123 using hip_block_zy_syncable_loop =
2124  hip_block_syncable_loop<named_dim::z, named_dim::y>;
2125 using hip_block_xyz_syncable_loop =
2126  hip_block_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2127 using hip_block_xzy_syncable_loop =
2128  hip_block_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2129 using hip_block_yxz_syncable_loop =
2130  hip_block_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2131 using hip_block_yzx_syncable_loop =
2132  hip_block_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2133 using hip_block_zxy_syncable_loop =
2134  hip_block_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2135 using hip_block_zyx_syncable_loop =
2136  hip_block_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
2137 
// hip_global_syncable_loop<dims...> instantiates hip_indexer_syncable_loop
// with one hip::IndexGlobal per listed dimension. Both the BLOCK_SIZE and
// the GRID_SIZE argument of IndexGlobal are named_usage::unspecified, in
// contrast to the thread variant (grid ignored) and the block variant
// (block ignored).
2138 template<named_dim... dims>
2139 using hip_global_syncable_loop =
2140  hip_indexer_syncable_loop<hip::IndexGlobal<dims,
2141  named_usage::unspecified,
2142  named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2143 using hip_global_x_syncable_loop = hip_global_syncable_loop<named_dim::x>;
2144 using hip_global_y_syncable_loop = hip_global_syncable_loop<named_dim::y>;
2145 using hip_global_z_syncable_loop = hip_global_syncable_loop<named_dim::z>;
2146 using hip_global_xy_syncable_loop =
2147  hip_global_syncable_loop<named_dim::x, named_dim::y>;
2148 using hip_global_xz_syncable_loop =
2149  hip_global_syncable_loop<named_dim::x, named_dim::z>;
2150 using hip_global_yx_syncable_loop =
2151  hip_global_syncable_loop<named_dim::y, named_dim::x>;
2152 using hip_global_yz_syncable_loop =
2153  hip_global_syncable_loop<named_dim::y, named_dim::z>;
2154 using hip_global_zx_syncable_loop =
2155  hip_global_syncable_loop<named_dim::z, named_dim::x>;
2156 using hip_global_zy_syncable_loop =
2157  hip_global_syncable_loop<named_dim::z, named_dim::y>;
2158 using hip_global_xyz_syncable_loop =
2159  hip_global_syncable_loop<named_dim::x, named_dim::y, named_dim::z>;
2160 using hip_global_xzy_syncable_loop =
2161  hip_global_syncable_loop<named_dim::x, named_dim::z, named_dim::y>;
2162 using hip_global_yxz_syncable_loop =
2163  hip_global_syncable_loop<named_dim::y, named_dim::x, named_dim::z>;
2164 using hip_global_yzx_syncable_loop =
2165  hip_global_syncable_loop<named_dim::y, named_dim::z, named_dim::x>;
2166 using hip_global_zxy_syncable_loop =
2167  hip_global_syncable_loop<named_dim::z, named_dim::x, named_dim::y>;
2168 using hip_global_zyx_syncable_loop =
2169  hip_global_syncable_loop<named_dim::z, named_dim::y, named_dim::x>;
2170 
2171 /*
2172  * Maps segment indices to flattened HIP threads, blocks, or global threads.
2173  * This is the lowest overhead mapping, but requires that there are the same
2174  * number of physical threads, blocks, or global threads as map requests.
2175  * Reshapes multiple physical threads, blocks, or global threads into a 1D
2176  * iteration space
2177  */
// hip_flatten_thread_direct_unchecked<dims...> instantiates
// hip_flatten_indexer_direct_unchecked with one hip::IndexGlobal per listed
// dimension: BLOCK_SIZE argument named_usage::unspecified, GRID_SIZE
// argument named_usage::ignored (the same pair every other thread policy in
// this file uses).
2178 template<named_dim... dims>
2179 using hip_flatten_thread_direct_unchecked =
2180  hip_flatten_indexer_direct_unchecked<
2181  hip::IndexGlobal<dims,
2182  named_usage::unspecified,
2183  named_usage::ignored>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2184 using hip_flatten_thread_x_direct_unchecked =
2185  hip_flatten_thread_direct_unchecked<named_dim::x>;
2186 using hip_flatten_thread_y_direct_unchecked =
2187  hip_flatten_thread_direct_unchecked<named_dim::y>;
2188 using hip_flatten_thread_z_direct_unchecked =
2189  hip_flatten_thread_direct_unchecked<named_dim::z>;
2190 using hip_flatten_thread_xy_direct_unchecked =
2191  hip_flatten_thread_direct_unchecked<named_dim::x, named_dim::y>;
2192 using hip_flatten_thread_xz_direct_unchecked =
2193  hip_flatten_thread_direct_unchecked<named_dim::x, named_dim::z>;
2194 using hip_flatten_thread_yx_direct_unchecked =
2195  hip_flatten_thread_direct_unchecked<named_dim::y, named_dim::x>;
2196 using hip_flatten_thread_yz_direct_unchecked =
2197  hip_flatten_thread_direct_unchecked<named_dim::y, named_dim::z>;
2198 using hip_flatten_thread_zx_direct_unchecked =
2199  hip_flatten_thread_direct_unchecked<named_dim::z, named_dim::x>;
2200 using hip_flatten_thread_zy_direct_unchecked =
2201  hip_flatten_thread_direct_unchecked<named_dim::z, named_dim::y>;
2202 using hip_flatten_thread_xyz_direct_unchecked =
2203  hip_flatten_thread_direct_unchecked<named_dim::x,
2204  named_dim::y,
2205  named_dim::z>;
2206 using hip_flatten_thread_xzy_direct_unchecked =
2207  hip_flatten_thread_direct_unchecked<named_dim::x,
2208  named_dim::z,
2209  named_dim::y>;
2210 using hip_flatten_thread_yxz_direct_unchecked =
2211  hip_flatten_thread_direct_unchecked<named_dim::y,
2212  named_dim::x,
2213  named_dim::z>;
2214 using hip_flatten_thread_yzx_direct_unchecked =
2215  hip_flatten_thread_direct_unchecked<named_dim::y,
2216  named_dim::z,
2217  named_dim::x>;
2218 using hip_flatten_thread_zxy_direct_unchecked =
2219  hip_flatten_thread_direct_unchecked<named_dim::z,
2220  named_dim::x,
2221  named_dim::y>;
2222 using hip_flatten_thread_zyx_direct_unchecked =
2223  hip_flatten_thread_direct_unchecked<named_dim::z,
2224  named_dim::y,
2225  named_dim::x>;
2226 
// hip_flatten_block_direct_unchecked<dims...> instantiates
// hip_flatten_indexer_direct_unchecked with one hip::IndexGlobal per listed
// dimension, passing named_usage::ignored as the BLOCK_SIZE argument and
// named_usage::unspecified as the GRID_SIZE argument.
2227 template<named_dim... dims>
2228 using hip_flatten_block_direct_unchecked = hip_flatten_indexer_direct_unchecked<
2229  hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2230 using hip_flatten_block_x_direct_unchecked =
2231  hip_flatten_block_direct_unchecked<named_dim::x>;
2232 using hip_flatten_block_y_direct_unchecked =
2233  hip_flatten_block_direct_unchecked<named_dim::y>;
2234 using hip_flatten_block_z_direct_unchecked =
2235  hip_flatten_block_direct_unchecked<named_dim::z>;
2236 using hip_flatten_block_xy_direct_unchecked =
2237  hip_flatten_block_direct_unchecked<named_dim::x, named_dim::y>;
2238 using hip_flatten_block_xz_direct_unchecked =
2239  hip_flatten_block_direct_unchecked<named_dim::x, named_dim::z>;
2240 using hip_flatten_block_yx_direct_unchecked =
2241  hip_flatten_block_direct_unchecked<named_dim::y, named_dim::x>;
2242 using hip_flatten_block_yz_direct_unchecked =
2243  hip_flatten_block_direct_unchecked<named_dim::y, named_dim::z>;
2244 using hip_flatten_block_zx_direct_unchecked =
2245  hip_flatten_block_direct_unchecked<named_dim::z, named_dim::x>;
2246 using hip_flatten_block_zy_direct_unchecked =
2247  hip_flatten_block_direct_unchecked<named_dim::z, named_dim::y>;
2248 using hip_flatten_block_xyz_direct_unchecked =
2249  hip_flatten_block_direct_unchecked<named_dim::x,
2250  named_dim::y,
2251  named_dim::z>;
2252 using hip_flatten_block_xzy_direct_unchecked =
2253  hip_flatten_block_direct_unchecked<named_dim::x,
2254  named_dim::z,
2255  named_dim::y>;
2256 using hip_flatten_block_yxz_direct_unchecked =
2257  hip_flatten_block_direct_unchecked<named_dim::y,
2258  named_dim::x,
2259  named_dim::z>;
2260 using hip_flatten_block_yzx_direct_unchecked =
2261  hip_flatten_block_direct_unchecked<named_dim::y,
2262  named_dim::z,
2263  named_dim::x>;
2264 using hip_flatten_block_zxy_direct_unchecked =
2265  hip_flatten_block_direct_unchecked<named_dim::z,
2266  named_dim::x,
2267  named_dim::y>;
2268 using hip_flatten_block_zyx_direct_unchecked =
2269  hip_flatten_block_direct_unchecked<named_dim::z,
2270  named_dim::y,
2271  named_dim::x>;
2272 
// hip_flatten_global_direct_unchecked<dims...> instantiates
// hip_flatten_indexer_direct_unchecked with one hip::IndexGlobal per listed
// dimension. Both the BLOCK_SIZE and the GRID_SIZE argument of IndexGlobal
// are named_usage::unspecified, in contrast to the thread variant (grid
// ignored) and the block variant (block ignored).
2273 template<named_dim... dims>
2274 using hip_flatten_global_direct_unchecked =
2275  hip_flatten_indexer_direct_unchecked<
2276  hip::IndexGlobal<dims,
2277  named_usage::unspecified,
2278  named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2279 using hip_flatten_global_x_direct_unchecked =
2280  hip_flatten_global_direct_unchecked<named_dim::x>;
2281 using hip_flatten_global_y_direct_unchecked =
2282  hip_flatten_global_direct_unchecked<named_dim::y>;
2283 using hip_flatten_global_z_direct_unchecked =
2284  hip_flatten_global_direct_unchecked<named_dim::z>;
2285 using hip_flatten_global_xy_direct_unchecked =
2286  hip_flatten_global_direct_unchecked<named_dim::x, named_dim::y>;
2287 using hip_flatten_global_xz_direct_unchecked =
2288  hip_flatten_global_direct_unchecked<named_dim::x, named_dim::z>;
2289 using hip_flatten_global_yx_direct_unchecked =
2290  hip_flatten_global_direct_unchecked<named_dim::y, named_dim::x>;
2291 using hip_flatten_global_yz_direct_unchecked =
2292  hip_flatten_global_direct_unchecked<named_dim::y, named_dim::z>;
2293 using hip_flatten_global_zx_direct_unchecked =
2294  hip_flatten_global_direct_unchecked<named_dim::z, named_dim::x>;
2295 using hip_flatten_global_zy_direct_unchecked =
2296  hip_flatten_global_direct_unchecked<named_dim::z, named_dim::y>;
2297 using hip_flatten_global_xyz_direct_unchecked =
2298  hip_flatten_global_direct_unchecked<named_dim::x,
2299  named_dim::y,
2300  named_dim::z>;
2301 using hip_flatten_global_xzy_direct_unchecked =
2302  hip_flatten_global_direct_unchecked<named_dim::x,
2303  named_dim::z,
2304  named_dim::y>;
2305 using hip_flatten_global_yxz_direct_unchecked =
2306  hip_flatten_global_direct_unchecked<named_dim::y,
2307  named_dim::x,
2308  named_dim::z>;
2309 using hip_flatten_global_yzx_direct_unchecked =
2310  hip_flatten_global_direct_unchecked<named_dim::y,
2311  named_dim::z,
2312  named_dim::x>;
2313 using hip_flatten_global_zxy_direct_unchecked =
2314  hip_flatten_global_direct_unchecked<named_dim::z,
2315  named_dim::x,
2316  named_dim::y>;
2317 using hip_flatten_global_zyx_direct_unchecked =
2318  hip_flatten_global_direct_unchecked<named_dim::z,
2319  named_dim::y,
2320  named_dim::x>;
2321 
2322 /*
2323  * Maps segment indices to flattened HIP threads, blocks, or global threads.
2324  * This is a low overhead mapping, but requires that there are enough
2325  * physical threads, blocks, or global threads to fit all of the direct map
2326  * requests.
2327  * Reshapes multiple physical threads, blocks, or global threads into a 1D
2328  * iteration space
2329  */
// hip_flatten_thread_direct<dims...> instantiates hip_flatten_indexer_direct
// with one hip::IndexGlobal per listed dimension (BLOCK_SIZE argument
// named_usage::unspecified, GRID_SIZE argument named_usage::ignored).
2330 template<named_dim... dims>
2331 using hip_flatten_thread_direct = hip_flatten_indexer_direct<
2332  hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2333 using hip_flatten_thread_x_direct = hip_flatten_thread_direct<named_dim::x>;
2334 using hip_flatten_thread_y_direct = hip_flatten_thread_direct<named_dim::y>;
2335 using hip_flatten_thread_z_direct = hip_flatten_thread_direct<named_dim::z>;
2336 using hip_flatten_thread_xy_direct =
2337  hip_flatten_thread_direct<named_dim::x, named_dim::y>;
2338 using hip_flatten_thread_xz_direct =
2339  hip_flatten_thread_direct<named_dim::x, named_dim::z>;
2340 using hip_flatten_thread_yx_direct =
2341  hip_flatten_thread_direct<named_dim::y, named_dim::x>;
2342 using hip_flatten_thread_yz_direct =
2343  hip_flatten_thread_direct<named_dim::y, named_dim::z>;
2344 using hip_flatten_thread_zx_direct =
2345  hip_flatten_thread_direct<named_dim::z, named_dim::x>;
2346 using hip_flatten_thread_zy_direct =
2347  hip_flatten_thread_direct<named_dim::z, named_dim::y>;
2348 using hip_flatten_thread_xyz_direct =
2349  hip_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
2350 using hip_flatten_thread_xzy_direct =
2351  hip_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
2352 using hip_flatten_thread_yxz_direct =
2353  hip_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
2354 using hip_flatten_thread_yzx_direct =
2355  hip_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
2356 using hip_flatten_thread_zxy_direct =
2357  hip_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
2358 using hip_flatten_thread_zyx_direct =
2359  hip_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
2360 
// hip_flatten_block_direct<dims...> instantiates hip_flatten_indexer_direct
// with one hip::IndexGlobal per listed dimension, passing
// named_usage::ignored as the BLOCK_SIZE argument and
// named_usage::unspecified as the GRID_SIZE argument.
2361 template<named_dim... dims>
2362 using hip_flatten_block_direct = hip_flatten_indexer_direct<
2363  hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2364 using hip_flatten_block_x_direct = hip_flatten_block_direct<named_dim::x>;
2365 using hip_flatten_block_y_direct = hip_flatten_block_direct<named_dim::y>;
2366 using hip_flatten_block_z_direct = hip_flatten_block_direct<named_dim::z>;
2367 using hip_flatten_block_xy_direct =
2368  hip_flatten_block_direct<named_dim::x, named_dim::y>;
2369 using hip_flatten_block_xz_direct =
2370  hip_flatten_block_direct<named_dim::x, named_dim::z>;
2371 using hip_flatten_block_yx_direct =
2372  hip_flatten_block_direct<named_dim::y, named_dim::x>;
2373 using hip_flatten_block_yz_direct =
2374  hip_flatten_block_direct<named_dim::y, named_dim::z>;
2375 using hip_flatten_block_zx_direct =
2376  hip_flatten_block_direct<named_dim::z, named_dim::x>;
2377 using hip_flatten_block_zy_direct =
2378  hip_flatten_block_direct<named_dim::z, named_dim::y>;
2379 using hip_flatten_block_xyz_direct =
2380  hip_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
2381 using hip_flatten_block_xzy_direct =
2382  hip_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
2383 using hip_flatten_block_yxz_direct =
2384  hip_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
2385 using hip_flatten_block_yzx_direct =
2386  hip_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
2387 using hip_flatten_block_zxy_direct =
2388  hip_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
2389 using hip_flatten_block_zyx_direct =
2390  hip_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
2391 
// hip_flatten_global_direct<dims...> instantiates hip_flatten_indexer_direct
// with one hip::IndexGlobal per listed dimension. Both the BLOCK_SIZE and
// the GRID_SIZE argument of IndexGlobal are named_usage::unspecified, in
// contrast to the thread variant (grid ignored) and the block variant
// (block ignored).
2392 template<named_dim... dims>
2393 using hip_flatten_global_direct =
2394  hip_flatten_indexer_direct<hip::IndexGlobal<dims,
2395  named_usage::unspecified,
2396  named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2397 using hip_flatten_global_x_direct = hip_flatten_global_direct<named_dim::x>;
2398 using hip_flatten_global_y_direct = hip_flatten_global_direct<named_dim::y>;
2399 using hip_flatten_global_z_direct = hip_flatten_global_direct<named_dim::z>;
2400 using hip_flatten_global_xy_direct =
2401  hip_flatten_global_direct<named_dim::x, named_dim::y>;
2402 using hip_flatten_global_xz_direct =
2403  hip_flatten_global_direct<named_dim::x, named_dim::z>;
2404 using hip_flatten_global_yx_direct =
2405  hip_flatten_global_direct<named_dim::y, named_dim::x>;
2406 using hip_flatten_global_yz_direct =
2407  hip_flatten_global_direct<named_dim::y, named_dim::z>;
2408 using hip_flatten_global_zx_direct =
2409  hip_flatten_global_direct<named_dim::z, named_dim::x>;
2410 using hip_flatten_global_zy_direct =
2411  hip_flatten_global_direct<named_dim::z, named_dim::y>;
2412 using hip_flatten_global_xyz_direct =
2413  hip_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
2414 using hip_flatten_global_xzy_direct =
2415  hip_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
2416 using hip_flatten_global_yxz_direct =
2417  hip_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
2418 using hip_flatten_global_yzx_direct =
2419  hip_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
2420 using hip_flatten_global_zxy_direct =
2421  hip_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
2422 using hip_flatten_global_zyx_direct =
2423  hip_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
2424 
2425 /*
2426  * Maps segment indices to flattened HIP threads, blocks, or global threads.
2427  * Reshapes multiple physical threads, blocks, or global threads into a 1D
2428  * iteration space
2429  * Uses block-stride or grid-stride looping to exceed the maximum number of
2430  * physical threads, blocks, or global threads
2431  */
// hip_flatten_thread_loop<dims...> instantiates hip_flatten_indexer_loop
// with one hip::IndexGlobal per listed dimension (BLOCK_SIZE argument
// named_usage::unspecified, GRID_SIZE argument named_usage::ignored).
2432 template<named_dim... dims>
2433 using hip_flatten_thread_loop = hip_flatten_indexer_loop<
2434  hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2435 using hip_flatten_thread_x_loop = hip_flatten_thread_loop<named_dim::x>;
2436 using hip_flatten_thread_y_loop = hip_flatten_thread_loop<named_dim::y>;
2437 using hip_flatten_thread_z_loop = hip_flatten_thread_loop<named_dim::z>;
2438 using hip_flatten_thread_xy_loop =
2439  hip_flatten_thread_loop<named_dim::x, named_dim::y>;
2440 using hip_flatten_thread_xz_loop =
2441  hip_flatten_thread_loop<named_dim::x, named_dim::z>;
2442 using hip_flatten_thread_yx_loop =
2443  hip_flatten_thread_loop<named_dim::y, named_dim::x>;
2444 using hip_flatten_thread_yz_loop =
2445  hip_flatten_thread_loop<named_dim::y, named_dim::z>;
2446 using hip_flatten_thread_zx_loop =
2447  hip_flatten_thread_loop<named_dim::z, named_dim::x>;
2448 using hip_flatten_thread_zy_loop =
2449  hip_flatten_thread_loop<named_dim::z, named_dim::y>;
2450 using hip_flatten_thread_xyz_loop =
2451  hip_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
2452 using hip_flatten_thread_xzy_loop =
2453  hip_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
2454 using hip_flatten_thread_yxz_loop =
2455  hip_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
2456 using hip_flatten_thread_yzx_loop =
2457  hip_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
2458 using hip_flatten_thread_zxy_loop =
2459  hip_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
2460 using hip_flatten_thread_zyx_loop =
2461  hip_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
2462 
// hip_flatten_block_loop<dims...> instantiates hip_flatten_indexer_loop with
// one hip::IndexGlobal per listed dimension, passing named_usage::ignored as
// the BLOCK_SIZE argument and named_usage::unspecified as the GRID_SIZE
// argument.
2463 template<named_dim... dims>
2464 using hip_flatten_block_loop = hip_flatten_indexer_loop<
2465  hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2466 using hip_flatten_block_x_loop = hip_flatten_block_loop<named_dim::x>;
2467 using hip_flatten_block_y_loop = hip_flatten_block_loop<named_dim::y>;
2468 using hip_flatten_block_z_loop = hip_flatten_block_loop<named_dim::z>;
2469 using hip_flatten_block_xy_loop =
2470  hip_flatten_block_loop<named_dim::x, named_dim::y>;
2471 using hip_flatten_block_xz_loop =
2472  hip_flatten_block_loop<named_dim::x, named_dim::z>;
2473 using hip_flatten_block_yx_loop =
2474  hip_flatten_block_loop<named_dim::y, named_dim::x>;
2475 using hip_flatten_block_yz_loop =
2476  hip_flatten_block_loop<named_dim::y, named_dim::z>;
2477 using hip_flatten_block_zx_loop =
2478  hip_flatten_block_loop<named_dim::z, named_dim::x>;
2479 using hip_flatten_block_zy_loop =
2480  hip_flatten_block_loop<named_dim::z, named_dim::y>;
2481 using hip_flatten_block_xyz_loop =
2482  hip_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
2483 using hip_flatten_block_xzy_loop =
2484  hip_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
2485 using hip_flatten_block_yxz_loop =
2486  hip_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
2487 using hip_flatten_block_yzx_loop =
2488  hip_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
2489 using hip_flatten_block_zxy_loop =
2490  hip_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
2491 using hip_flatten_block_zyx_loop =
2492  hip_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
2493 
// hip_flatten_global_loop<dims...> instantiates hip_flatten_indexer_loop
// with one hip::IndexGlobal per listed dimension. Both the BLOCK_SIZE and
// the GRID_SIZE argument of IndexGlobal are named_usage::unspecified, in
// contrast to the thread variant (grid ignored) and the block variant
// (block ignored).
2494 template<named_dim... dims>
2495 using hip_flatten_global_loop =
2496  hip_flatten_indexer_loop<hip::IndexGlobal<dims,
2497  named_usage::unspecified,
2498  named_usage::unspecified>...>;
// One alias per ordering of one, two, or three of x, y, z; the alias name
// spells the order in which the dimensions are passed.
2499 using hip_flatten_global_x_loop = hip_flatten_global_loop<named_dim::x>;
2500 using hip_flatten_global_y_loop = hip_flatten_global_loop<named_dim::y>;
2501 using hip_flatten_global_z_loop = hip_flatten_global_loop<named_dim::z>;
2502 using hip_flatten_global_xy_loop =
2503  hip_flatten_global_loop<named_dim::x, named_dim::y>;
2504 using hip_flatten_global_xz_loop =
2505  hip_flatten_global_loop<named_dim::x, named_dim::z>;
2506 using hip_flatten_global_yx_loop =
2507  hip_flatten_global_loop<named_dim::y, named_dim::x>;
2508 using hip_flatten_global_yz_loop =
2509  hip_flatten_global_loop<named_dim::y, named_dim::z>;
2510 using hip_flatten_global_zx_loop =
2511  hip_flatten_global_loop<named_dim::z, named_dim::x>;
2512 using hip_flatten_global_zy_loop =
2513  hip_flatten_global_loop<named_dim::z, named_dim::y>;
2514 using hip_flatten_global_xyz_loop =
2515  hip_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
2516 using hip_flatten_global_xzy_loop =
2517  hip_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
2518 using hip_flatten_global_yxz_loop =
2519  hip_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
2520 using hip_flatten_global_yzx_loop =
2521  hip_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
2522 using hip_flatten_global_zxy_loop =
2523  hip_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
2524 using hip_flatten_global_zyx_loop =
2525  hip_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
2526 
// Thread "direct unchecked" policies with compile-time sizes. Each alias
// forwards its integer template parameters to hip::thread_x / thread_y /
// thread_z (defined outside this excerpt) and wraps the resulting indexers
// in hip_indexer_direct_unchecked. The template parameters are declared in
// the same order as the dimension letters in the alias name, e.g.
// hip_thread_size_zx_direct_unchecked<Z_SIZE, X_SIZE>.
// One-dimensional forms.
2532 template<int X_SIZE>
2533 using hip_thread_size_x_direct_unchecked =
2534  hip_indexer_direct_unchecked<hip::thread_x<X_SIZE>>;
2535 template<int Y_SIZE>
2536 using hip_thread_size_y_direct_unchecked =
2537  hip_indexer_direct_unchecked<hip::thread_y<Y_SIZE>>;
2538 template<int Z_SIZE>
2539 using hip_thread_size_z_direct_unchecked =
2540  hip_indexer_direct_unchecked<hip::thread_z<Z_SIZE>>;
// Two-dimensional forms, one per ordered pair of distinct dimensions.
2541 template<int X_SIZE, int Y_SIZE>
2542 using hip_thread_size_xy_direct_unchecked =
2543  hip_indexer_direct_unchecked<hip::thread_x<X_SIZE>, hip::thread_y<Y_SIZE>>;
2544 template<int X_SIZE, int Z_SIZE>
2545 using hip_thread_size_xz_direct_unchecked =
2546  hip_indexer_direct_unchecked<hip::thread_x<X_SIZE>, hip::thread_z<Z_SIZE>>;
2547 template<int Y_SIZE, int X_SIZE>
2548 using hip_thread_size_yx_direct_unchecked =
2549  hip_indexer_direct_unchecked<hip::thread_y<Y_SIZE>, hip::thread_x<X_SIZE>>;
2550 template<int Y_SIZE, int Z_SIZE>
2551 using hip_thread_size_yz_direct_unchecked =
2552  hip_indexer_direct_unchecked<hip::thread_y<Y_SIZE>, hip::thread_z<Z_SIZE>>;
2553 template<int Z_SIZE, int X_SIZE>
2554 using hip_thread_size_zx_direct_unchecked =
2555  hip_indexer_direct_unchecked<hip::thread_z<Z_SIZE>, hip::thread_x<X_SIZE>>;
2556 template<int Z_SIZE, int Y_SIZE>
2557 using hip_thread_size_zy_direct_unchecked =
2558  hip_indexer_direct_unchecked<hip::thread_z<Z_SIZE>, hip::thread_y<Y_SIZE>>;
// Three-dimensional forms, one per permutation of x, y, z.
2559 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
2560 using hip_thread_size_xyz_direct_unchecked =
2561  hip_indexer_direct_unchecked<hip::thread_x<X_SIZE>,
2562  hip::thread_y<Y_SIZE>,
2563  hip::thread_z<Z_SIZE>>;
2564 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
2565 using hip_thread_size_xzy_direct_unchecked =
2566  hip_indexer_direct_unchecked<hip::thread_x<X_SIZE>,
2567  hip::thread_z<Z_SIZE>,
2568  hip::thread_y<Y_SIZE>>;
2569 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
2570 using hip_thread_size_yxz_direct_unchecked =
2571  hip_indexer_direct_unchecked<hip::thread_y<Y_SIZE>,
2572  hip::thread_x<X_SIZE>,
2573  hip::thread_z<Z_SIZE>>;
2574 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
2575 using hip_thread_size_yzx_direct_unchecked =
2576  hip_indexer_direct_unchecked<hip::thread_y<Y_SIZE>,
2577  hip::thread_z<Z_SIZE>,
2578  hip::thread_x<X_SIZE>>;
2579 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
2580 using hip_thread_size_zxy_direct_unchecked =
2581  hip_indexer_direct_unchecked<hip::thread_z<Z_SIZE>,
2582  hip::thread_x<X_SIZE>,
2583  hip::thread_y<Y_SIZE>>;
2584 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
2585 using hip_thread_size_zyx_direct_unchecked =
2586  hip_indexer_direct_unchecked<hip::thread_z<Z_SIZE>,
2587  hip::thread_y<Y_SIZE>,
2588  hip::thread_x<X_SIZE>>;
2589 
// Block aliases of hip_indexer_direct_unchecked for one, two, or three
// dimensions. Each alias forwards its *_SIZE template parameters to
// hip::block_x / block_y / block_z; the dimension letters in the alias name
// give both the indexer order and the template parameter order.
// NOTE(review): the mapping semantics come from hip_indexer_direct_unchecked,
// which is defined outside this section -- confirm there.
2590 template<int X_SIZE>
2591 using hip_block_size_x_direct_unchecked =
2592  hip_indexer_direct_unchecked<hip::block_x<X_SIZE>>;
2593 template<int Y_SIZE>
2594 using hip_block_size_y_direct_unchecked =
2595  hip_indexer_direct_unchecked<hip::block_y<Y_SIZE>>;
2596 template<int Z_SIZE>
2597 using hip_block_size_z_direct_unchecked =
2598  hip_indexer_direct_unchecked<hip::block_z<Z_SIZE>>;
2599 template<int X_SIZE, int Y_SIZE>
2600 using hip_block_size_xy_direct_unchecked =
2601  hip_indexer_direct_unchecked<hip::block_x<X_SIZE>, hip::block_y<Y_SIZE>>;
2602 template<int X_SIZE, int Z_SIZE>
2603 using hip_block_size_xz_direct_unchecked =
2604  hip_indexer_direct_unchecked<hip::block_x<X_SIZE>, hip::block_z<Z_SIZE>>;
2605 template<int Y_SIZE, int X_SIZE>
2606 using hip_block_size_yx_direct_unchecked =
2607  hip_indexer_direct_unchecked<hip::block_y<Y_SIZE>, hip::block_x<X_SIZE>>;
2608 template<int Y_SIZE, int Z_SIZE>
2609 using hip_block_size_yz_direct_unchecked =
2610  hip_indexer_direct_unchecked<hip::block_y<Y_SIZE>, hip::block_z<Z_SIZE>>;
2611 template<int Z_SIZE, int X_SIZE>
2612 using hip_block_size_zx_direct_unchecked =
2613  hip_indexer_direct_unchecked<hip::block_z<Z_SIZE>, hip::block_x<X_SIZE>>;
2614 template<int Z_SIZE, int Y_SIZE>
2615 using hip_block_size_zy_direct_unchecked =
2616  hip_indexer_direct_unchecked<hip::block_z<Z_SIZE>, hip::block_y<Y_SIZE>>;
2617 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
2618 using hip_block_size_xyz_direct_unchecked =
2619  hip_indexer_direct_unchecked<hip::block_x<X_SIZE>,
2620  hip::block_y<Y_SIZE>,
2621  hip::block_z<Z_SIZE>>;
2622 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
2623 using hip_block_size_xzy_direct_unchecked =
2624  hip_indexer_direct_unchecked<hip::block_x<X_SIZE>,
2625  hip::block_z<Z_SIZE>,
2626  hip::block_y<Y_SIZE>>;
2627 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
2628 using hip_block_size_yxz_direct_unchecked =
2629  hip_indexer_direct_unchecked<hip::block_y<Y_SIZE>,
2630  hip::block_x<X_SIZE>,
2631  hip::block_z<Z_SIZE>>;
2632 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
2633 using hip_block_size_yzx_direct_unchecked =
2634  hip_indexer_direct_unchecked<hip::block_y<Y_SIZE>,
2635  hip::block_z<Z_SIZE>,
2636  hip::block_x<X_SIZE>>;
2637 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
2638 using hip_block_size_zxy_direct_unchecked =
2639  hip_indexer_direct_unchecked<hip::block_z<Z_SIZE>,
2640  hip::block_x<X_SIZE>,
2641  hip::block_y<Y_SIZE>>;
2642 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
2643 using hip_block_size_zyx_direct_unchecked =
2644  hip_indexer_direct_unchecked<hip::block_z<Z_SIZE>,
2645  hip::block_y<Y_SIZE>,
2646  hip::block_x<X_SIZE>>;
2647 
// Global aliases of hip_indexer_direct_unchecked for one, two, or three
// dimensions. Each dimension contributes a hip::global_x / global_y /
// global_z indexer parameterized by a block size and a grid size.
// Template parameter layout: all *_BLOCK_SIZE parameters come first, in the
// dimension order spelled by the alias name, followed by the matching
// *_GRID_SIZE parameters in the same order. Every grid size defaults to
// named_usage::unspecified, so callers may supply block sizes only.
// NOTE(review): the mapping semantics come from hip_indexer_direct_unchecked,
// which is defined outside this section -- confirm there.
2648 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
2649 using hip_global_size_x_direct_unchecked =
2650  hip_indexer_direct_unchecked<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2651 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
2652 using hip_global_size_y_direct_unchecked =
2653  hip_indexer_direct_unchecked<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2654 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
2655 using hip_global_size_z_direct_unchecked =
2656  hip_indexer_direct_unchecked<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2657 template<int X_BLOCK_SIZE,
2658  int Y_BLOCK_SIZE,
2659  int X_GRID_SIZE = named_usage::unspecified,
2660  int Y_GRID_SIZE = named_usage::unspecified>
2661 using hip_global_size_xy_direct_unchecked =
2662  hip_indexer_direct_unchecked<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2663  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2664 template<int X_BLOCK_SIZE,
2665  int Z_BLOCK_SIZE,
2666  int X_GRID_SIZE = named_usage::unspecified,
2667  int Z_GRID_SIZE = named_usage::unspecified>
2668 using hip_global_size_xz_direct_unchecked =
2669  hip_indexer_direct_unchecked<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2670  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2671 template<int Y_BLOCK_SIZE,
2672  int X_BLOCK_SIZE,
2673  int Y_GRID_SIZE = named_usage::unspecified,
2674  int X_GRID_SIZE = named_usage::unspecified>
2675 using hip_global_size_yx_direct_unchecked =
2676  hip_indexer_direct_unchecked<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2677  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2678 template<int Y_BLOCK_SIZE,
2679  int Z_BLOCK_SIZE,
2680  int Y_GRID_SIZE = named_usage::unspecified,
2681  int Z_GRID_SIZE = named_usage::unspecified>
2682 using hip_global_size_yz_direct_unchecked =
2683  hip_indexer_direct_unchecked<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2684  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2685 template<int Z_BLOCK_SIZE,
2686  int X_BLOCK_SIZE,
2687  int Z_GRID_SIZE = named_usage::unspecified,
2688  int X_GRID_SIZE = named_usage::unspecified>
2689 using hip_global_size_zx_direct_unchecked =
2690  hip_indexer_direct_unchecked<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2691  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2692 template<int Z_BLOCK_SIZE,
2693  int Y_BLOCK_SIZE,
2694  int Z_GRID_SIZE = named_usage::unspecified,
2695  int Y_GRID_SIZE = named_usage::unspecified>
2696 using hip_global_size_zy_direct_unchecked =
2697  hip_indexer_direct_unchecked<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2698  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2699 template<int X_BLOCK_SIZE,
2700  int Y_BLOCK_SIZE,
2701  int Z_BLOCK_SIZE,
2702  int X_GRID_SIZE = named_usage::unspecified,
2703  int Y_GRID_SIZE = named_usage::unspecified,
2704  int Z_GRID_SIZE = named_usage::unspecified>
2705 using hip_global_size_xyz_direct_unchecked =
2706  hip_indexer_direct_unchecked<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2707  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2708  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2709 template<int X_BLOCK_SIZE,
2710  int Z_BLOCK_SIZE,
2711  int Y_BLOCK_SIZE,
2712  int X_GRID_SIZE = named_usage::unspecified,
2713  int Z_GRID_SIZE = named_usage::unspecified,
2714  int Y_GRID_SIZE = named_usage::unspecified>
2715 using hip_global_size_xzy_direct_unchecked =
2716  hip_indexer_direct_unchecked<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2717  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2718  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2719 template<int Y_BLOCK_SIZE,
2720  int X_BLOCK_SIZE,
2721  int Z_BLOCK_SIZE,
2722  int Y_GRID_SIZE = named_usage::unspecified,
2723  int X_GRID_SIZE = named_usage::unspecified,
2724  int Z_GRID_SIZE = named_usage::unspecified>
2725 using hip_global_size_yxz_direct_unchecked =
2726  hip_indexer_direct_unchecked<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2727  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2728  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2729 template<int Y_BLOCK_SIZE,
2730  int Z_BLOCK_SIZE,
2731  int X_BLOCK_SIZE,
2732  int Y_GRID_SIZE = named_usage::unspecified,
2733  int Z_GRID_SIZE = named_usage::unspecified,
2734  int X_GRID_SIZE = named_usage::unspecified>
2735 using hip_global_size_yzx_direct_unchecked =
2736  hip_indexer_direct_unchecked<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2737  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2738  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2739 template<int Z_BLOCK_SIZE,
2740  int X_BLOCK_SIZE,
2741  int Y_BLOCK_SIZE,
2742  int Z_GRID_SIZE = named_usage::unspecified,
2743  int X_GRID_SIZE = named_usage::unspecified,
2744  int Y_GRID_SIZE = named_usage::unspecified>
2745 using hip_global_size_zxy_direct_unchecked =
2746  hip_indexer_direct_unchecked<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2747  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2748  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2749 template<int Z_BLOCK_SIZE,
2750  int Y_BLOCK_SIZE,
2751  int X_BLOCK_SIZE,
2752  int Z_GRID_SIZE = named_usage::unspecified,
2753  int Y_GRID_SIZE = named_usage::unspecified,
2754  int X_GRID_SIZE = named_usage::unspecified>
2755 using hip_global_size_zyx_direct_unchecked =
2756  hip_indexer_direct_unchecked<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2757  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2758  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2759 
// Thread aliases of hip_indexer_direct for one, two, or three dimensions.
// Each alias forwards its *_SIZE template parameters to hip::thread_x /
// thread_y / thread_z; the dimension letters in the alias name give both the
// indexer order and the template parameter order.
// NOTE(review): how hip_indexer_direct differs from the _direct_unchecked and
// _loop variants is determined by definitions outside this section.
2765 template<int X_SIZE>
2766 using hip_thread_size_x_direct = hip_indexer_direct<hip::thread_x<X_SIZE>>;
2767 template<int Y_SIZE>
2768 using hip_thread_size_y_direct = hip_indexer_direct<hip::thread_y<Y_SIZE>>;
2769 template<int Z_SIZE>
2770 using hip_thread_size_z_direct = hip_indexer_direct<hip::thread_z<Z_SIZE>>;
2771 template<int X_SIZE, int Y_SIZE>
2772 using hip_thread_size_xy_direct =
2773  hip_indexer_direct<hip::thread_x<X_SIZE>, hip::thread_y<Y_SIZE>>;
2774 template<int X_SIZE, int Z_SIZE>
2775 using hip_thread_size_xz_direct =
2776  hip_indexer_direct<hip::thread_x<X_SIZE>, hip::thread_z<Z_SIZE>>;
2777 template<int Y_SIZE, int X_SIZE>
2778 using hip_thread_size_yx_direct =
2779  hip_indexer_direct<hip::thread_y<Y_SIZE>, hip::thread_x<X_SIZE>>;
2780 template<int Y_SIZE, int Z_SIZE>
2781 using hip_thread_size_yz_direct =
2782  hip_indexer_direct<hip::thread_y<Y_SIZE>, hip::thread_z<Z_SIZE>>;
2783 template<int Z_SIZE, int X_SIZE>
2784 using hip_thread_size_zx_direct =
2785  hip_indexer_direct<hip::thread_z<Z_SIZE>, hip::thread_x<X_SIZE>>;
2786 template<int Z_SIZE, int Y_SIZE>
2787 using hip_thread_size_zy_direct =
2788  hip_indexer_direct<hip::thread_z<Z_SIZE>, hip::thread_y<Y_SIZE>>;
2789 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
2790 using hip_thread_size_xyz_direct = hip_indexer_direct<hip::thread_x<X_SIZE>,
2791  hip::thread_y<Y_SIZE>,
2792  hip::thread_z<Z_SIZE>>;
2793 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
2794 using hip_thread_size_xzy_direct = hip_indexer_direct<hip::thread_x<X_SIZE>,
2795  hip::thread_z<Z_SIZE>,
2796  hip::thread_y<Y_SIZE>>;
2797 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
2798 using hip_thread_size_yxz_direct = hip_indexer_direct<hip::thread_y<Y_SIZE>,
2799  hip::thread_x<X_SIZE>,
2800  hip::thread_z<Z_SIZE>>;
2801 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
2802 using hip_thread_size_yzx_direct = hip_indexer_direct<hip::thread_y<Y_SIZE>,
2803  hip::thread_z<Z_SIZE>,
2804  hip::thread_x<X_SIZE>>;
2805 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
2806 using hip_thread_size_zxy_direct = hip_indexer_direct<hip::thread_z<Z_SIZE>,
2807  hip::thread_x<X_SIZE>,
2808  hip::thread_y<Y_SIZE>>;
2809 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
2810 using hip_thread_size_zyx_direct = hip_indexer_direct<hip::thread_z<Z_SIZE>,
2811  hip::thread_y<Y_SIZE>,
2812  hip::thread_x<X_SIZE>>;
2813 
// Block aliases of hip_indexer_direct for one, two, or three dimensions.
// Each alias forwards its *_SIZE template parameters to hip::block_x /
// block_y / block_z; the dimension letters in the alias name give both the
// indexer order and the template parameter order.
// NOTE(review): how hip_indexer_direct differs from the _direct_unchecked and
// _loop variants is determined by definitions outside this section.
2814 template<int X_SIZE>
2815 using hip_block_size_x_direct = hip_indexer_direct<hip::block_x<X_SIZE>>;
2816 template<int Y_SIZE>
2817 using hip_block_size_y_direct = hip_indexer_direct<hip::block_y<Y_SIZE>>;
2818 template<int Z_SIZE>
2819 using hip_block_size_z_direct = hip_indexer_direct<hip::block_z<Z_SIZE>>;
2820 template<int X_SIZE, int Y_SIZE>
2821 using hip_block_size_xy_direct =
2822  hip_indexer_direct<hip::block_x<X_SIZE>, hip::block_y<Y_SIZE>>;
2823 template<int X_SIZE, int Z_SIZE>
2824 using hip_block_size_xz_direct =
2825  hip_indexer_direct<hip::block_x<X_SIZE>, hip::block_z<Z_SIZE>>;
2826 template<int Y_SIZE, int X_SIZE>
2827 using hip_block_size_yx_direct =
2828  hip_indexer_direct<hip::block_y<Y_SIZE>, hip::block_x<X_SIZE>>;
2829 template<int Y_SIZE, int Z_SIZE>
2830 using hip_block_size_yz_direct =
2831  hip_indexer_direct<hip::block_y<Y_SIZE>, hip::block_z<Z_SIZE>>;
2832 template<int Z_SIZE, int X_SIZE>
2833 using hip_block_size_zx_direct =
2834  hip_indexer_direct<hip::block_z<Z_SIZE>, hip::block_x<X_SIZE>>;
2835 template<int Z_SIZE, int Y_SIZE>
2836 using hip_block_size_zy_direct =
2837  hip_indexer_direct<hip::block_z<Z_SIZE>, hip::block_y<Y_SIZE>>;
2838 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
2839 using hip_block_size_xyz_direct = hip_indexer_direct<hip::block_x<X_SIZE>,
2840  hip::block_y<Y_SIZE>,
2841  hip::block_z<Z_SIZE>>;
2842 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
2843 using hip_block_size_xzy_direct = hip_indexer_direct<hip::block_x<X_SIZE>,
2844  hip::block_z<Z_SIZE>,
2845  hip::block_y<Y_SIZE>>;
2846 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
2847 using hip_block_size_yxz_direct = hip_indexer_direct<hip::block_y<Y_SIZE>,
2848  hip::block_x<X_SIZE>,
2849  hip::block_z<Z_SIZE>>;
2850 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
2851 using hip_block_size_yzx_direct = hip_indexer_direct<hip::block_y<Y_SIZE>,
2852  hip::block_z<Z_SIZE>,
2853  hip::block_x<X_SIZE>>;
2854 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
2855 using hip_block_size_zxy_direct = hip_indexer_direct<hip::block_z<Z_SIZE>,
2856  hip::block_x<X_SIZE>,
2857  hip::block_y<Y_SIZE>>;
2858 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
2859 using hip_block_size_zyx_direct = hip_indexer_direct<hip::block_z<Z_SIZE>,
2860  hip::block_y<Y_SIZE>,
2861  hip::block_x<X_SIZE>>;
2862 
// Global aliases of hip_indexer_direct for one, two, or three dimensions.
// Each dimension contributes a hip::global_x / global_y / global_z indexer
// parameterized by a block size and a grid size.
// Template parameter layout: all *_BLOCK_SIZE parameters come first, in the
// dimension order spelled by the alias name, followed by the matching
// *_GRID_SIZE parameters in the same order. Every grid size defaults to
// named_usage::unspecified, so callers may supply block sizes only.
// NOTE(review): how hip_indexer_direct differs from the _direct_unchecked and
// _loop variants is determined by definitions outside this section.
2863 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
2864 using hip_global_size_x_direct =
2865  hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2866 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
2867 using hip_global_size_y_direct =
2868  hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2869 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
2870 using hip_global_size_z_direct =
2871  hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2872 template<int X_BLOCK_SIZE,
2873  int Y_BLOCK_SIZE,
2874  int X_GRID_SIZE = named_usage::unspecified,
2875  int Y_GRID_SIZE = named_usage::unspecified>
2876 using hip_global_size_xy_direct =
2877  hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2878  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2879 template<int X_BLOCK_SIZE,
2880  int Z_BLOCK_SIZE,
2881  int X_GRID_SIZE = named_usage::unspecified,
2882  int Z_GRID_SIZE = named_usage::unspecified>
2883 using hip_global_size_xz_direct =
2884  hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2885  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2886 template<int Y_BLOCK_SIZE,
2887  int X_BLOCK_SIZE,
2888  int Y_GRID_SIZE = named_usage::unspecified,
2889  int X_GRID_SIZE = named_usage::unspecified>
2890 using hip_global_size_yx_direct =
2891  hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2892  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2893 template<int Y_BLOCK_SIZE,
2894  int Z_BLOCK_SIZE,
2895  int Y_GRID_SIZE = named_usage::unspecified,
2896  int Z_GRID_SIZE = named_usage::unspecified>
2897 using hip_global_size_yz_direct =
2898  hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2899  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2900 template<int Z_BLOCK_SIZE,
2901  int X_BLOCK_SIZE,
2902  int Z_GRID_SIZE = named_usage::unspecified,
2903  int X_GRID_SIZE = named_usage::unspecified>
2904 using hip_global_size_zx_direct =
2905  hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2906  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2907 template<int Z_BLOCK_SIZE,
2908  int Y_BLOCK_SIZE,
2909  int Z_GRID_SIZE = named_usage::unspecified,
2910  int Y_GRID_SIZE = named_usage::unspecified>
2911 using hip_global_size_zy_direct =
2912  hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2913  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2914 template<int X_BLOCK_SIZE,
2915  int Y_BLOCK_SIZE,
2916  int Z_BLOCK_SIZE,
2917  int X_GRID_SIZE = named_usage::unspecified,
2918  int Y_GRID_SIZE = named_usage::unspecified,
2919  int Z_GRID_SIZE = named_usage::unspecified>
2920 using hip_global_size_xyz_direct =
2921  hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2922  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2923  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2924 template<int X_BLOCK_SIZE,
2925  int Z_BLOCK_SIZE,
2926  int Y_BLOCK_SIZE,
2927  int X_GRID_SIZE = named_usage::unspecified,
2928  int Z_GRID_SIZE = named_usage::unspecified,
2929  int Y_GRID_SIZE = named_usage::unspecified>
2930 using hip_global_size_xzy_direct =
2931  hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2932  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2933  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2934 template<int Y_BLOCK_SIZE,
2935  int X_BLOCK_SIZE,
2936  int Z_BLOCK_SIZE,
2937  int Y_GRID_SIZE = named_usage::unspecified,
2938  int X_GRID_SIZE = named_usage::unspecified,
2939  int Z_GRID_SIZE = named_usage::unspecified>
2940 using hip_global_size_yxz_direct =
2941  hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2942  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2943  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
2944 template<int Y_BLOCK_SIZE,
2945  int Z_BLOCK_SIZE,
2946  int X_BLOCK_SIZE,
2947  int Y_GRID_SIZE = named_usage::unspecified,
2948  int Z_GRID_SIZE = named_usage::unspecified,
2949  int X_GRID_SIZE = named_usage::unspecified>
2950 using hip_global_size_yzx_direct =
2951  hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2952  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2953  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2954 template<int Z_BLOCK_SIZE,
2955  int X_BLOCK_SIZE,
2956  int Y_BLOCK_SIZE,
2957  int Z_GRID_SIZE = named_usage::unspecified,
2958  int X_GRID_SIZE = named_usage::unspecified,
2959  int Y_GRID_SIZE = named_usage::unspecified>
2960 using hip_global_size_zxy_direct =
2961  hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2962  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
2963  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
2964 template<int Z_BLOCK_SIZE,
2965  int Y_BLOCK_SIZE,
2966  int X_BLOCK_SIZE,
2967  int Z_GRID_SIZE = named_usage::unspecified,
2968  int Y_GRID_SIZE = named_usage::unspecified,
2969  int X_GRID_SIZE = named_usage::unspecified>
2970 using hip_global_size_zyx_direct =
2971  hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
2972  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
2973  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
2974 
// Thread aliases of hip_indexer_loop for one, two, or three dimensions.
// Each alias forwards its *_SIZE template parameters to hip::thread_x /
// thread_y / thread_z; the dimension letters in the alias name give both the
// indexer order and the template parameter order.
// NOTE(review): the looping behavior itself is implemented by
// hip_indexer_loop, which is defined outside this section -- confirm there.
2980 template<int X_SIZE>
2981 using hip_thread_size_x_loop = hip_indexer_loop<hip::thread_x<X_SIZE>>;
2982 template<int Y_SIZE>
2983 using hip_thread_size_y_loop = hip_indexer_loop<hip::thread_y<Y_SIZE>>;
2984 template<int Z_SIZE>
2985 using hip_thread_size_z_loop = hip_indexer_loop<hip::thread_z<Z_SIZE>>;
2986 template<int X_SIZE, int Y_SIZE>
2987 using hip_thread_size_xy_loop =
2988  hip_indexer_loop<hip::thread_x<X_SIZE>, hip::thread_y<Y_SIZE>>;
2989 template<int X_SIZE, int Z_SIZE>
2990 using hip_thread_size_xz_loop =
2991  hip_indexer_loop<hip::thread_x<X_SIZE>, hip::thread_z<Z_SIZE>>;
2992 template<int Y_SIZE, int X_SIZE>
2993 using hip_thread_size_yx_loop =
2994  hip_indexer_loop<hip::thread_y<Y_SIZE>, hip::thread_x<X_SIZE>>;
2995 template<int Y_SIZE, int Z_SIZE>
2996 using hip_thread_size_yz_loop =
2997  hip_indexer_loop<hip::thread_y<Y_SIZE>, hip::thread_z<Z_SIZE>>;
2998 template<int Z_SIZE, int X_SIZE>
2999 using hip_thread_size_zx_loop =
3000  hip_indexer_loop<hip::thread_z<Z_SIZE>, hip::thread_x<X_SIZE>>;
3001 template<int Z_SIZE, int Y_SIZE>
3002 using hip_thread_size_zy_loop =
3003  hip_indexer_loop<hip::thread_z<Z_SIZE>, hip::thread_y<Y_SIZE>>;
3004 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3005 using hip_thread_size_xyz_loop = hip_indexer_loop<hip::thread_x<X_SIZE>,
3006  hip::thread_y<Y_SIZE>,
3007  hip::thread_z<Z_SIZE>>;
3008 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3009 using hip_thread_size_xzy_loop = hip_indexer_loop<hip::thread_x<X_SIZE>,
3010  hip::thread_z<Z_SIZE>,
3011  hip::thread_y<Y_SIZE>>;
3012 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3013 using hip_thread_size_yxz_loop = hip_indexer_loop<hip::thread_y<Y_SIZE>,
3014  hip::thread_x<X_SIZE>,
3015  hip::thread_z<Z_SIZE>>;
3016 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3017 using hip_thread_size_yzx_loop = hip_indexer_loop<hip::thread_y<Y_SIZE>,
3018  hip::thread_z<Z_SIZE>,
3019  hip::thread_x<X_SIZE>>;
3020 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3021 using hip_thread_size_zxy_loop = hip_indexer_loop<hip::thread_z<Z_SIZE>,
3022  hip::thread_x<X_SIZE>,
3023  hip::thread_y<Y_SIZE>>;
3024 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3025 using hip_thread_size_zyx_loop = hip_indexer_loop<hip::thread_z<Z_SIZE>,
3026  hip::thread_y<Y_SIZE>,
3027  hip::thread_x<X_SIZE>>;
3028 
// Block aliases of hip_indexer_loop for one, two, or three dimensions.
// Each alias forwards its *_SIZE template parameters to hip::block_x /
// block_y / block_z; the dimension letters in the alias name give both the
// indexer order and the template parameter order.
// NOTE(review): the looping behavior itself is implemented by
// hip_indexer_loop, which is defined outside this section -- confirm there.
3029 template<int X_SIZE>
3030 using hip_block_size_x_loop = hip_indexer_loop<hip::block_x<X_SIZE>>;
3031 template<int Y_SIZE>
3032 using hip_block_size_y_loop = hip_indexer_loop<hip::block_y<Y_SIZE>>;
3033 template<int Z_SIZE>
3034 using hip_block_size_z_loop = hip_indexer_loop<hip::block_z<Z_SIZE>>;
3035 template<int X_SIZE, int Y_SIZE>
3036 using hip_block_size_xy_loop =
3037  hip_indexer_loop<hip::block_x<X_SIZE>, hip::block_y<Y_SIZE>>;
3038 template<int X_SIZE, int Z_SIZE>
3039 using hip_block_size_xz_loop =
3040  hip_indexer_loop<hip::block_x<X_SIZE>, hip::block_z<Z_SIZE>>;
3041 template<int Y_SIZE, int X_SIZE>
3042 using hip_block_size_yx_loop =
3043  hip_indexer_loop<hip::block_y<Y_SIZE>, hip::block_x<X_SIZE>>;
3044 template<int Y_SIZE, int Z_SIZE>
3045 using hip_block_size_yz_loop =
3046  hip_indexer_loop<hip::block_y<Y_SIZE>, hip::block_z<Z_SIZE>>;
3047 template<int Z_SIZE, int X_SIZE>
3048 using hip_block_size_zx_loop =
3049  hip_indexer_loop<hip::block_z<Z_SIZE>, hip::block_x<X_SIZE>>;
3050 template<int Z_SIZE, int Y_SIZE>
3051 using hip_block_size_zy_loop =
3052  hip_indexer_loop<hip::block_z<Z_SIZE>, hip::block_y<Y_SIZE>>;
3053 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3054 using hip_block_size_xyz_loop = hip_indexer_loop<hip::block_x<X_SIZE>,
3055  hip::block_y<Y_SIZE>,
3056  hip::block_z<Z_SIZE>>;
3057 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3058 using hip_block_size_xzy_loop = hip_indexer_loop<hip::block_x<X_SIZE>,
3059  hip::block_z<Z_SIZE>,
3060  hip::block_y<Y_SIZE>>;
3061 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3062 using hip_block_size_yxz_loop = hip_indexer_loop<hip::block_y<Y_SIZE>,
3063  hip::block_x<X_SIZE>,
3064  hip::block_z<Z_SIZE>>;
3065 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3066 using hip_block_size_yzx_loop = hip_indexer_loop<hip::block_y<Y_SIZE>,
3067  hip::block_z<Z_SIZE>,
3068  hip::block_x<X_SIZE>>;
3069 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3070 using hip_block_size_zxy_loop = hip_indexer_loop<hip::block_z<Z_SIZE>,
3071  hip::block_x<X_SIZE>,
3072  hip::block_y<Y_SIZE>>;
3073 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3074 using hip_block_size_zyx_loop = hip_indexer_loop<hip::block_z<Z_SIZE>,
3075  hip::block_y<Y_SIZE>,
3076  hip::block_x<X_SIZE>>;
3077 
// Global aliases of hip_indexer_loop for one, two, or three dimensions.
// Each dimension contributes a hip::global_x / global_y / global_z indexer
// parameterized by a block size and a grid size.
// Template parameter layout: all *_BLOCK_SIZE parameters come first, in the
// dimension order spelled by the alias name, followed by the matching
// *_GRID_SIZE parameters in the same order. Every grid size defaults to
// named_usage::unspecified, so callers may supply block sizes only.
// NOTE(review): the looping behavior itself is implemented by
// hip_indexer_loop, which is defined outside this section -- confirm there.
3078 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3079 using hip_global_size_x_loop =
3080  hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3081 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3082 using hip_global_size_y_loop =
3083  hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3084 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3085 using hip_global_size_z_loop =
3086  hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3087 template<int X_BLOCK_SIZE,
3088  int Y_BLOCK_SIZE,
3089  int X_GRID_SIZE = named_usage::unspecified,
3090  int Y_GRID_SIZE = named_usage::unspecified>
3091 using hip_global_size_xy_loop =
3092  hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3093  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3094 template<int X_BLOCK_SIZE,
3095  int Z_BLOCK_SIZE,
3096  int X_GRID_SIZE = named_usage::unspecified,
3097  int Z_GRID_SIZE = named_usage::unspecified>
3098 using hip_global_size_xz_loop =
3099  hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3100  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3101 template<int Y_BLOCK_SIZE,
3102  int X_BLOCK_SIZE,
3103  int Y_GRID_SIZE = named_usage::unspecified,
3104  int X_GRID_SIZE = named_usage::unspecified>
3105 using hip_global_size_yx_loop =
3106  hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3107  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3108 template<int Y_BLOCK_SIZE,
3109  int Z_BLOCK_SIZE,
3110  int Y_GRID_SIZE = named_usage::unspecified,
3111  int Z_GRID_SIZE = named_usage::unspecified>
3112 using hip_global_size_yz_loop =
3113  hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3114  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3115 template<int Z_BLOCK_SIZE,
3116  int X_BLOCK_SIZE,
3117  int Z_GRID_SIZE = named_usage::unspecified,
3118  int X_GRID_SIZE = named_usage::unspecified>
3119 using hip_global_size_zx_loop =
3120  hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3121  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3122 template<int Z_BLOCK_SIZE,
3123  int Y_BLOCK_SIZE,
3124  int Z_GRID_SIZE = named_usage::unspecified,
3125  int Y_GRID_SIZE = named_usage::unspecified>
3126 using hip_global_size_zy_loop =
3127  hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3128  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3129 template<int X_BLOCK_SIZE,
3130  int Y_BLOCK_SIZE,
3131  int Z_BLOCK_SIZE,
3132  int X_GRID_SIZE = named_usage::unspecified,
3133  int Y_GRID_SIZE = named_usage::unspecified,
3134  int Z_GRID_SIZE = named_usage::unspecified>
3135 using hip_global_size_xyz_loop =
3136  hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3137  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3138  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3139 template<int X_BLOCK_SIZE,
3140  int Z_BLOCK_SIZE,
3141  int Y_BLOCK_SIZE,
3142  int X_GRID_SIZE = named_usage::unspecified,
3143  int Z_GRID_SIZE = named_usage::unspecified,
3144  int Y_GRID_SIZE = named_usage::unspecified>
3145 using hip_global_size_xzy_loop =
3146  hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3147  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3148  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3149 template<int Y_BLOCK_SIZE,
3150  int X_BLOCK_SIZE,
3151  int Z_BLOCK_SIZE,
3152  int Y_GRID_SIZE = named_usage::unspecified,
3153  int X_GRID_SIZE = named_usage::unspecified,
3154  int Z_GRID_SIZE = named_usage::unspecified>
3155 using hip_global_size_yxz_loop =
3156  hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3157  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3158  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3159 template<int Y_BLOCK_SIZE,
3160  int Z_BLOCK_SIZE,
3161  int X_BLOCK_SIZE,
3162  int Y_GRID_SIZE = named_usage::unspecified,
3163  int Z_GRID_SIZE = named_usage::unspecified,
3164  int X_GRID_SIZE = named_usage::unspecified>
3165 using hip_global_size_yzx_loop =
3166  hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3167  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3168  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3169 template<int Z_BLOCK_SIZE,
3170  int X_BLOCK_SIZE,
3171  int Y_BLOCK_SIZE,
3172  int Z_GRID_SIZE = named_usage::unspecified,
3173  int X_GRID_SIZE = named_usage::unspecified,
3174  int Y_GRID_SIZE = named_usage::unspecified>
3175 using hip_global_size_zxy_loop =
3176  hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3177  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3178  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3179 template<int Z_BLOCK_SIZE,
3180  int Y_BLOCK_SIZE,
3181  int X_BLOCK_SIZE,
3182  int Z_GRID_SIZE = named_usage::unspecified,
3183  int Y_GRID_SIZE = named_usage::unspecified,
3184  int X_GRID_SIZE = named_usage::unspecified>
3185 using hip_global_size_zyx_loop =
3186  hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3187  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3188  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3189 
3190 /*
3191  * Maps segment indices to flattened HIP threads, blocks, or global threads.
3192  * This is the lowest overhead mapping, but requires that there are the same
3193  * number of physical threads, blocks, or global threads as the map requests.
3194  * Reshapes multiple physical threads, blocks, or global threads into a 1D
3195  * iteration space.
3196  */
// Thread aliases of hip_flatten_indexer_direct_unchecked for one, two, or
// three dimensions. Each alias forwards its *_SIZE template parameters to
// hip::thread_x / thread_y / thread_z; the dimension letters in the alias
// name give both the indexer order and the template parameter order.
// NOTE(review): how the indexer order affects the flattened index is decided
// by hip_flatten_indexer_direct_unchecked, defined outside this section.
3197 template<int X_SIZE>
3198 using hip_flatten_thread_size_x_direct_unchecked =
3199  hip_flatten_indexer_direct_unchecked<hip::thread_x<X_SIZE>>;
3200 template<int Y_SIZE>
3201 using hip_flatten_thread_size_y_direct_unchecked =
3202  hip_flatten_indexer_direct_unchecked<hip::thread_y<Y_SIZE>>;
3203 template<int Z_SIZE>
3204 using hip_flatten_thread_size_z_direct_unchecked =
3205  hip_flatten_indexer_direct_unchecked<hip::thread_z<Z_SIZE>>;
3206 template<int X_SIZE, int Y_SIZE>
3207 using hip_flatten_thread_size_xy_direct_unchecked =
3208  hip_flatten_indexer_direct_unchecked<hip::thread_x<X_SIZE>,
3209  hip::thread_y<Y_SIZE>>;
3210 template<int X_SIZE, int Z_SIZE>
3211 using hip_flatten_thread_size_xz_direct_unchecked =
3212  hip_flatten_indexer_direct_unchecked<hip::thread_x<X_SIZE>,
3213  hip::thread_z<Z_SIZE>>;
3214 template<int Y_SIZE, int X_SIZE>
3215 using hip_flatten_thread_size_yx_direct_unchecked =
3216  hip_flatten_indexer_direct_unchecked<hip::thread_y<Y_SIZE>,
3217  hip::thread_x<X_SIZE>>;
3218 template<int Y_SIZE, int Z_SIZE>
3219 using hip_flatten_thread_size_yz_direct_unchecked =
3220  hip_flatten_indexer_direct_unchecked<hip::thread_y<Y_SIZE>,
3221  hip::thread_z<Z_SIZE>>;
3222 template<int Z_SIZE, int X_SIZE>
3223 using hip_flatten_thread_size_zx_direct_unchecked =
3224  hip_flatten_indexer_direct_unchecked<hip::thread_z<Z_SIZE>,
3225  hip::thread_x<X_SIZE>>;
3226 template<int Z_SIZE, int Y_SIZE>
3227 using hip_flatten_thread_size_zy_direct_unchecked =
3228  hip_flatten_indexer_direct_unchecked<hip::thread_z<Z_SIZE>,
3229  hip::thread_y<Y_SIZE>>;
3230 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3231 using hip_flatten_thread_size_xyz_direct_unchecked =
3232  hip_flatten_indexer_direct_unchecked<hip::thread_x<X_SIZE>,
3233  hip::thread_y<Y_SIZE>,
3234  hip::thread_z<Z_SIZE>>;
3235 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3236 using hip_flatten_thread_size_xzy_direct_unchecked =
3237  hip_flatten_indexer_direct_unchecked<hip::thread_x<X_SIZE>,
3238  hip::thread_z<Z_SIZE>,
3239  hip::thread_y<Y_SIZE>>;
3240 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3241 using hip_flatten_thread_size_yxz_direct_unchecked =
3242  hip_flatten_indexer_direct_unchecked<hip::thread_y<Y_SIZE>,
3243  hip::thread_x<X_SIZE>,
3244  hip::thread_z<Z_SIZE>>;
3245 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3246 using hip_flatten_thread_size_yzx_direct_unchecked =
3247  hip_flatten_indexer_direct_unchecked<hip::thread_y<Y_SIZE>,
3248  hip::thread_z<Z_SIZE>,
3249  hip::thread_x<X_SIZE>>;
3250 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3251 using hip_flatten_thread_size_zxy_direct_unchecked =
3252  hip_flatten_indexer_direct_unchecked<hip::thread_z<Z_SIZE>,
3253  hip::thread_x<X_SIZE>,
3254  hip::thread_y<Y_SIZE>>;
3255 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3256 using hip_flatten_thread_size_zyx_direct_unchecked =
3257  hip_flatten_indexer_direct_unchecked<hip::thread_z<Z_SIZE>,
3258  hip::thread_y<Y_SIZE>,
3259  hip::thread_x<X_SIZE>>;
3260 
// Block aliases of hip_flatten_indexer_direct_unchecked for one, two, or
// three dimensions. Each alias forwards its *_SIZE template parameters to
// hip::block_x / block_y / block_z; the dimension letters in the alias name
// give both the indexer order and the template parameter order.
// NOTE(review): how the indexer order affects the flattened index is decided
// by hip_flatten_indexer_direct_unchecked, defined outside this section.
3261 template<int X_SIZE>
3262 using hip_flatten_block_size_x_direct_unchecked =
3263  hip_flatten_indexer_direct_unchecked<hip::block_x<X_SIZE>>;
3264 template<int Y_SIZE>
3265 using hip_flatten_block_size_y_direct_unchecked =
3266  hip_flatten_indexer_direct_unchecked<hip::block_y<Y_SIZE>>;
3267 template<int Z_SIZE>
3268 using hip_flatten_block_size_z_direct_unchecked =
3269  hip_flatten_indexer_direct_unchecked<hip::block_z<Z_SIZE>>;
3270 template<int X_SIZE, int Y_SIZE>
3271 using hip_flatten_block_size_xy_direct_unchecked =
3272  hip_flatten_indexer_direct_unchecked<hip::block_x<X_SIZE>,
3273  hip::block_y<Y_SIZE>>;
3274 template<int X_SIZE, int Z_SIZE>
3275 using hip_flatten_block_size_xz_direct_unchecked =
3276  hip_flatten_indexer_direct_unchecked<hip::block_x<X_SIZE>,
3277  hip::block_z<Z_SIZE>>;
3278 template<int Y_SIZE, int X_SIZE>
3279 using hip_flatten_block_size_yx_direct_unchecked =
3280  hip_flatten_indexer_direct_unchecked<hip::block_y<Y_SIZE>,
3281  hip::block_x<X_SIZE>>;
3282 template<int Y_SIZE, int Z_SIZE>
3283 using hip_flatten_block_size_yz_direct_unchecked =
3284  hip_flatten_indexer_direct_unchecked<hip::block_y<Y_SIZE>,
3285  hip::block_z<Z_SIZE>>;
3286 template<int Z_SIZE, int X_SIZE>
3287 using hip_flatten_block_size_zx_direct_unchecked =
3288  hip_flatten_indexer_direct_unchecked<hip::block_z<Z_SIZE>,
3289  hip::block_x<X_SIZE>>;
3290 template<int Z_SIZE, int Y_SIZE>
3291 using hip_flatten_block_size_zy_direct_unchecked =
3292  hip_flatten_indexer_direct_unchecked<hip::block_z<Z_SIZE>,
3293  hip::block_y<Y_SIZE>>;
3294 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3295 using hip_flatten_block_size_xyz_direct_unchecked =
3296  hip_flatten_indexer_direct_unchecked<hip::block_x<X_SIZE>,
3297  hip::block_y<Y_SIZE>,
3298  hip::block_z<Z_SIZE>>;
3299 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3300 using hip_flatten_block_size_xzy_direct_unchecked =
3301  hip_flatten_indexer_direct_unchecked<hip::block_x<X_SIZE>,
3302  hip::block_z<Z_SIZE>,
3303  hip::block_y<Y_SIZE>>;
3304 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3305 using hip_flatten_block_size_yxz_direct_unchecked =
3306  hip_flatten_indexer_direct_unchecked<hip::block_y<Y_SIZE>,
3307  hip::block_x<X_SIZE>,
3308  hip::block_z<Z_SIZE>>;
3309 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3310 using hip_flatten_block_size_yzx_direct_unchecked =
3311  hip_flatten_indexer_direct_unchecked<hip::block_y<Y_SIZE>,
3312  hip::block_z<Z_SIZE>,
3313  hip::block_x<X_SIZE>>;
3314 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3315 using hip_flatten_block_size_zxy_direct_unchecked =
3316  hip_flatten_indexer_direct_unchecked<hip::block_z<Z_SIZE>,
3317  hip::block_x<X_SIZE>,
3318  hip::block_y<Y_SIZE>>;
3319 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3320 using hip_flatten_block_size_zyx_direct_unchecked =
3321  hip_flatten_indexer_direct_unchecked<hip::block_z<Z_SIZE>,
3322  hip::block_y<Y_SIZE>,
3323  hip::block_x<X_SIZE>>;
3324 
// Global variants of the flatten direct_unchecked policies. Each alias passes
// the hip::global_* indexers named by its suffix, in suffix order, to
// hip_flatten_indexer_direct_unchecked. Template parameters list every block
// size first (in suffix order) and then the matching grid sizes in the same
// order; each grid size defaults to named_usage::unspecified.
3325 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3326 using hip_flatten_global_size_x_direct_unchecked =
3327  hip_flatten_indexer_direct_unchecked<
3328  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3329 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3330 using hip_flatten_global_size_y_direct_unchecked =
3331  hip_flatten_indexer_direct_unchecked<
3332  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3333 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3334 using hip_flatten_global_size_z_direct_unchecked =
3335  hip_flatten_indexer_direct_unchecked<
3336  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3337 template<int X_BLOCK_SIZE,
3338  int Y_BLOCK_SIZE,
3339  int X_GRID_SIZE = named_usage::unspecified,
3340  int Y_GRID_SIZE = named_usage::unspecified>
3341 using hip_flatten_global_size_xy_direct_unchecked =
3342  hip_flatten_indexer_direct_unchecked<
3343  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3344  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3345 template<int X_BLOCK_SIZE,
3346  int Z_BLOCK_SIZE,
3347  int X_GRID_SIZE = named_usage::unspecified,
3348  int Z_GRID_SIZE = named_usage::unspecified>
3349 using hip_flatten_global_size_xz_direct_unchecked =
3350  hip_flatten_indexer_direct_unchecked<
3351  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3352  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3353 template<int Y_BLOCK_SIZE,
3354  int X_BLOCK_SIZE,
3355  int Y_GRID_SIZE = named_usage::unspecified,
3356  int X_GRID_SIZE = named_usage::unspecified>
3357 using hip_flatten_global_size_yx_direct_unchecked =
3358  hip_flatten_indexer_direct_unchecked<
3359  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3360  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3361 template<int Y_BLOCK_SIZE,
3362  int Z_BLOCK_SIZE,
3363  int Y_GRID_SIZE = named_usage::unspecified,
3364  int Z_GRID_SIZE = named_usage::unspecified>
3365 using hip_flatten_global_size_yz_direct_unchecked =
3366  hip_flatten_indexer_direct_unchecked<
3367  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3368  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3369 template<int Z_BLOCK_SIZE,
3370  int X_BLOCK_SIZE,
3371  int Z_GRID_SIZE = named_usage::unspecified,
3372  int X_GRID_SIZE = named_usage::unspecified>
3373 using hip_flatten_global_size_zx_direct_unchecked =
3374  hip_flatten_indexer_direct_unchecked<
3375  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3376  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3377 template<int Z_BLOCK_SIZE,
3378  int Y_BLOCK_SIZE,
3379  int Z_GRID_SIZE = named_usage::unspecified,
3380  int Y_GRID_SIZE = named_usage::unspecified>
3381 using hip_flatten_global_size_zy_direct_unchecked =
3382  hip_flatten_indexer_direct_unchecked<
3383  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3384  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3385 template<int X_BLOCK_SIZE,
3386  int Y_BLOCK_SIZE,
3387  int Z_BLOCK_SIZE,
3388  int X_GRID_SIZE = named_usage::unspecified,
3389  int Y_GRID_SIZE = named_usage::unspecified,
3390  int Z_GRID_SIZE = named_usage::unspecified>
3391 using hip_flatten_global_size_xyz_direct_unchecked =
3392  hip_flatten_indexer_direct_unchecked<
3393  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3394  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3395  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3396 template<int X_BLOCK_SIZE,
3397  int Z_BLOCK_SIZE,
3398  int Y_BLOCK_SIZE,
3399  int X_GRID_SIZE = named_usage::unspecified,
3400  int Z_GRID_SIZE = named_usage::unspecified,
3401  int Y_GRID_SIZE = named_usage::unspecified>
3402 using hip_flatten_global_size_xzy_direct_unchecked =
3403  hip_flatten_indexer_direct_unchecked<
3404  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3405  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3406  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3407 template<int Y_BLOCK_SIZE,
3408  int X_BLOCK_SIZE,
3409  int Z_BLOCK_SIZE,
3410  int Y_GRID_SIZE = named_usage::unspecified,
3411  int X_GRID_SIZE = named_usage::unspecified,
3412  int Z_GRID_SIZE = named_usage::unspecified>
3413 using hip_flatten_global_size_yxz_direct_unchecked =
3414  hip_flatten_indexer_direct_unchecked<
3415  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3416  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3417  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3418 template<int Y_BLOCK_SIZE,
3419  int Z_BLOCK_SIZE,
3420  int X_BLOCK_SIZE,
3421  int Y_GRID_SIZE = named_usage::unspecified,
3422  int Z_GRID_SIZE = named_usage::unspecified,
3423  int X_GRID_SIZE = named_usage::unspecified>
3424 using hip_flatten_global_size_yzx_direct_unchecked =
3425  hip_flatten_indexer_direct_unchecked<
3426  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3427  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3428  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3429 template<int Z_BLOCK_SIZE,
3430  int X_BLOCK_SIZE,
3431  int Y_BLOCK_SIZE,
3432  int Z_GRID_SIZE = named_usage::unspecified,
3433  int X_GRID_SIZE = named_usage::unspecified,
3434  int Y_GRID_SIZE = named_usage::unspecified>
3435 using hip_flatten_global_size_zxy_direct_unchecked =
3436  hip_flatten_indexer_direct_unchecked<
3437  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3438  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3439  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3440 template<int Z_BLOCK_SIZE,
3441  int Y_BLOCK_SIZE,
3442  int X_BLOCK_SIZE,
3443  int Z_GRID_SIZE = named_usage::unspecified,
3444  int Y_GRID_SIZE = named_usage::unspecified,
3445  int X_GRID_SIZE = named_usage::unspecified>
3446 using hip_flatten_global_size_zyx_direct_unchecked =
3447  hip_flatten_indexer_direct_unchecked<
3448  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3449  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3450  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3451 
3452 /*
3453  * Maps segment indices to flattened HIP threads, blocks, or global threads.
3454  * This is a low overhead mapping, but requires that there are enough
3455  * physical threads, blocks, or global threads to fit all of the direct map
3456  * requests.
3457  * Reshapes multiple physical threads, blocks, or global threads into a 1D
3458  * iteration space.
3459  */
// Thread variants of the flatten direct policies. Each alias passes the
// hip::thread_* indexers named by its suffix, in suffix order, to
// hip_flatten_indexer_direct. The template parameters (one size per
// dimension) are listed in that same order.
3460 template<int X_SIZE>
3461 using hip_flatten_thread_size_x_direct =
3462  hip_flatten_indexer_direct<hip::thread_x<X_SIZE>>;
3463 template<int Y_SIZE>
3464 using hip_flatten_thread_size_y_direct =
3465  hip_flatten_indexer_direct<hip::thread_y<Y_SIZE>>;
3466 template<int Z_SIZE>
3467 using hip_flatten_thread_size_z_direct =
3468  hip_flatten_indexer_direct<hip::thread_z<Z_SIZE>>;
3469 template<int X_SIZE, int Y_SIZE>
3470 using hip_flatten_thread_size_xy_direct =
3471  hip_flatten_indexer_direct<hip::thread_x<X_SIZE>, hip::thread_y<Y_SIZE>>;
3472 template<int X_SIZE, int Z_SIZE>
3473 using hip_flatten_thread_size_xz_direct =
3474  hip_flatten_indexer_direct<hip::thread_x<X_SIZE>, hip::thread_z<Z_SIZE>>;
3475 template<int Y_SIZE, int X_SIZE>
3476 using hip_flatten_thread_size_yx_direct =
3477  hip_flatten_indexer_direct<hip::thread_y<Y_SIZE>, hip::thread_x<X_SIZE>>;
3478 template<int Y_SIZE, int Z_SIZE>
3479 using hip_flatten_thread_size_yz_direct =
3480  hip_flatten_indexer_direct<hip::thread_y<Y_SIZE>, hip::thread_z<Z_SIZE>>;
3481 template<int Z_SIZE, int X_SIZE>
3482 using hip_flatten_thread_size_zx_direct =
3483  hip_flatten_indexer_direct<hip::thread_z<Z_SIZE>, hip::thread_x<X_SIZE>>;
3484 template<int Z_SIZE, int Y_SIZE>
3485 using hip_flatten_thread_size_zy_direct =
3486  hip_flatten_indexer_direct<hip::thread_z<Z_SIZE>, hip::thread_y<Y_SIZE>>;
3487 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3488 using hip_flatten_thread_size_xyz_direct =
3489  hip_flatten_indexer_direct<hip::thread_x<X_SIZE>,
3490  hip::thread_y<Y_SIZE>,
3491  hip::thread_z<Z_SIZE>>;
3492 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3493 using hip_flatten_thread_size_xzy_direct =
3494  hip_flatten_indexer_direct<hip::thread_x<X_SIZE>,
3495  hip::thread_z<Z_SIZE>,
3496  hip::thread_y<Y_SIZE>>;
3497 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3498 using hip_flatten_thread_size_yxz_direct =
3499  hip_flatten_indexer_direct<hip::thread_y<Y_SIZE>,
3500  hip::thread_x<X_SIZE>,
3501  hip::thread_z<Z_SIZE>>;
3502 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3503 using hip_flatten_thread_size_yzx_direct =
3504  hip_flatten_indexer_direct<hip::thread_y<Y_SIZE>,
3505  hip::thread_z<Z_SIZE>,
3506  hip::thread_x<X_SIZE>>;
3507 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3508 using hip_flatten_thread_size_zxy_direct =
3509  hip_flatten_indexer_direct<hip::thread_z<Z_SIZE>,
3510  hip::thread_x<X_SIZE>,
3511  hip::thread_y<Y_SIZE>>;
3512 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3513 using hip_flatten_thread_size_zyx_direct =
3514  hip_flatten_indexer_direct<hip::thread_z<Z_SIZE>,
3515  hip::thread_y<Y_SIZE>,
3516  hip::thread_x<X_SIZE>>;
3517 
// Block variants of the flatten direct policies. Each alias passes the
// hip::block_* indexers named by its suffix, in suffix order, to
// hip_flatten_indexer_direct. The template parameters (one size per
// dimension) are listed in that same order.
3518 template<int X_SIZE>
3519 using hip_flatten_block_size_x_direct =
3520  hip_flatten_indexer_direct<hip::block_x<X_SIZE>>;
3521 template<int Y_SIZE>
3522 using hip_flatten_block_size_y_direct =
3523  hip_flatten_indexer_direct<hip::block_y<Y_SIZE>>;
3524 template<int Z_SIZE>
3525 using hip_flatten_block_size_z_direct =
3526  hip_flatten_indexer_direct<hip::block_z<Z_SIZE>>;
3527 template<int X_SIZE, int Y_SIZE>
3528 using hip_flatten_block_size_xy_direct =
3529  hip_flatten_indexer_direct<hip::block_x<X_SIZE>, hip::block_y<Y_SIZE>>;
3530 template<int X_SIZE, int Z_SIZE>
3531 using hip_flatten_block_size_xz_direct =
3532  hip_flatten_indexer_direct<hip::block_x<X_SIZE>, hip::block_z<Z_SIZE>>;
3533 template<int Y_SIZE, int X_SIZE>
3534 using hip_flatten_block_size_yx_direct =
3535  hip_flatten_indexer_direct<hip::block_y<Y_SIZE>, hip::block_x<X_SIZE>>;
3536 template<int Y_SIZE, int Z_SIZE>
3537 using hip_flatten_block_size_yz_direct =
3538  hip_flatten_indexer_direct<hip::block_y<Y_SIZE>, hip::block_z<Z_SIZE>>;
3539 template<int Z_SIZE, int X_SIZE>
3540 using hip_flatten_block_size_zx_direct =
3541  hip_flatten_indexer_direct<hip::block_z<Z_SIZE>, hip::block_x<X_SIZE>>;
3542 template<int Z_SIZE, int Y_SIZE>
3543 using hip_flatten_block_size_zy_direct =
3544  hip_flatten_indexer_direct<hip::block_z<Z_SIZE>, hip::block_y<Y_SIZE>>;
3545 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3546 using hip_flatten_block_size_xyz_direct =
3547  hip_flatten_indexer_direct<hip::block_x<X_SIZE>,
3548  hip::block_y<Y_SIZE>,
3549  hip::block_z<Z_SIZE>>;
3550 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3551 using hip_flatten_block_size_xzy_direct =
3552  hip_flatten_indexer_direct<hip::block_x<X_SIZE>,
3553  hip::block_z<Z_SIZE>,
3554  hip::block_y<Y_SIZE>>;
3555 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3556 using hip_flatten_block_size_yxz_direct =
3557  hip_flatten_indexer_direct<hip::block_y<Y_SIZE>,
3558  hip::block_x<X_SIZE>,
3559  hip::block_z<Z_SIZE>>;
3560 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3561 using hip_flatten_block_size_yzx_direct =
3562  hip_flatten_indexer_direct<hip::block_y<Y_SIZE>,
3563  hip::block_z<Z_SIZE>,
3564  hip::block_x<X_SIZE>>;
3565 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3566 using hip_flatten_block_size_zxy_direct =
3567  hip_flatten_indexer_direct<hip::block_z<Z_SIZE>,
3568  hip::block_x<X_SIZE>,
3569  hip::block_y<Y_SIZE>>;
3570 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3571 using hip_flatten_block_size_zyx_direct =
3572  hip_flatten_indexer_direct<hip::block_z<Z_SIZE>,
3573  hip::block_y<Y_SIZE>,
3574  hip::block_x<X_SIZE>>;
3575 
// Global variants of the flatten direct policies. Each alias passes the
// hip::global_* indexers named by its suffix, in suffix order, to
// hip_flatten_indexer_direct. Template parameters list every block size
// first (in suffix order) and then the matching grid sizes in the same
// order; each grid size defaults to named_usage::unspecified.
3576 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3577 using hip_flatten_global_size_x_direct =
3578  hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3579 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3580 using hip_flatten_global_size_y_direct =
3581  hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3582 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3583 using hip_flatten_global_size_z_direct =
3584  hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3585 template<int X_BLOCK_SIZE,
3586  int Y_BLOCK_SIZE,
3587  int X_GRID_SIZE = named_usage::unspecified,
3588  int Y_GRID_SIZE = named_usage::unspecified>
3589 using hip_flatten_global_size_xy_direct =
3590  hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3591  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3592 template<int X_BLOCK_SIZE,
3593  int Z_BLOCK_SIZE,
3594  int X_GRID_SIZE = named_usage::unspecified,
3595  int Z_GRID_SIZE = named_usage::unspecified>
3596 using hip_flatten_global_size_xz_direct =
3597  hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3598  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3599 template<int Y_BLOCK_SIZE,
3600  int X_BLOCK_SIZE,
3601  int Y_GRID_SIZE = named_usage::unspecified,
3602  int X_GRID_SIZE = named_usage::unspecified>
3603 using hip_flatten_global_size_yx_direct =
3604  hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3605  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3606 template<int Y_BLOCK_SIZE,
3607  int Z_BLOCK_SIZE,
3608  int Y_GRID_SIZE = named_usage::unspecified,
3609  int Z_GRID_SIZE = named_usage::unspecified>
3610 using hip_flatten_global_size_yz_direct =
3611  hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3612  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3613 template<int Z_BLOCK_SIZE,
3614  int X_BLOCK_SIZE,
3615  int Z_GRID_SIZE = named_usage::unspecified,
3616  int X_GRID_SIZE = named_usage::unspecified>
3617 using hip_flatten_global_size_zx_direct =
3618  hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3619  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3620 template<int Z_BLOCK_SIZE,
3621  int Y_BLOCK_SIZE,
3622  int Z_GRID_SIZE = named_usage::unspecified,
3623  int Y_GRID_SIZE = named_usage::unspecified>
3624 using hip_flatten_global_size_zy_direct =
3625  hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3626  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3627 template<int X_BLOCK_SIZE,
3628  int Y_BLOCK_SIZE,
3629  int Z_BLOCK_SIZE,
3630  int X_GRID_SIZE = named_usage::unspecified,
3631  int Y_GRID_SIZE = named_usage::unspecified,
3632  int Z_GRID_SIZE = named_usage::unspecified>
3633 using hip_flatten_global_size_xyz_direct =
3634  hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3635  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3636  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3637 template<int X_BLOCK_SIZE,
3638  int Z_BLOCK_SIZE,
3639  int Y_BLOCK_SIZE,
3640  int X_GRID_SIZE = named_usage::unspecified,
3641  int Z_GRID_SIZE = named_usage::unspecified,
3642  int Y_GRID_SIZE = named_usage::unspecified>
3643 using hip_flatten_global_size_xzy_direct =
3644  hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3645  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3646  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3647 template<int Y_BLOCK_SIZE,
3648  int X_BLOCK_SIZE,
3649  int Z_BLOCK_SIZE,
3650  int Y_GRID_SIZE = named_usage::unspecified,
3651  int X_GRID_SIZE = named_usage::unspecified,
3652  int Z_GRID_SIZE = named_usage::unspecified>
3653 using hip_flatten_global_size_yxz_direct =
3654  hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3655  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3656  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3657 template<int Y_BLOCK_SIZE,
3658  int Z_BLOCK_SIZE,
3659  int X_BLOCK_SIZE,
3660  int Y_GRID_SIZE = named_usage::unspecified,
3661  int Z_GRID_SIZE = named_usage::unspecified,
3662  int X_GRID_SIZE = named_usage::unspecified>
3663 using hip_flatten_global_size_yzx_direct =
3664  hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3665  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3666  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3667 template<int Z_BLOCK_SIZE,
3668  int X_BLOCK_SIZE,
3669  int Y_BLOCK_SIZE,
3670  int Z_GRID_SIZE = named_usage::unspecified,
3671  int X_GRID_SIZE = named_usage::unspecified,
3672  int Y_GRID_SIZE = named_usage::unspecified>
3673 using hip_flatten_global_size_zxy_direct =
3674  hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3675  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3676  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3677 template<int Z_BLOCK_SIZE,
3678  int Y_BLOCK_SIZE,
3679  int X_BLOCK_SIZE,
3680  int Z_GRID_SIZE = named_usage::unspecified,
3681  int Y_GRID_SIZE = named_usage::unspecified,
3682  int X_GRID_SIZE = named_usage::unspecified>
3683 using hip_flatten_global_size_zyx_direct =
3684  hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3685  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3686  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3687 
3688 /*
3689  * Maps segment indices to flattened HIP threads, blocks, or global threads.
3690  * Reshapes multiple physical threads, blocks, or global threads into a 1D
3691  * iteration space.
3692  * Uses block-stride or grid-stride looping to exceed the maximum number of
3693  * physical threads, blocks, or global threads.
3694  */
// Thread variants of the flatten loop policies. Each alias passes the
// hip::thread_* indexers named by its suffix, in suffix order, to
// hip_flatten_indexer_loop. The template parameters (one size per
// dimension) are listed in that same order.
3695 template<int X_SIZE>
3696 using hip_flatten_thread_size_x_loop =
3697  hip_flatten_indexer_loop<hip::thread_x<X_SIZE>>;
3698 template<int Y_SIZE>
3699 using hip_flatten_thread_size_y_loop =
3700  hip_flatten_indexer_loop<hip::thread_y<Y_SIZE>>;
3701 template<int Z_SIZE>
3702 using hip_flatten_thread_size_z_loop =
3703  hip_flatten_indexer_loop<hip::thread_z<Z_SIZE>>;
3704 template<int X_SIZE, int Y_SIZE>
3705 using hip_flatten_thread_size_xy_loop =
3706  hip_flatten_indexer_loop<hip::thread_x<X_SIZE>, hip::thread_y<Y_SIZE>>;
3707 template<int X_SIZE, int Z_SIZE>
3708 using hip_flatten_thread_size_xz_loop =
3709  hip_flatten_indexer_loop<hip::thread_x<X_SIZE>, hip::thread_z<Z_SIZE>>;
3710 template<int Y_SIZE, int X_SIZE>
3711 using hip_flatten_thread_size_yx_loop =
3712  hip_flatten_indexer_loop<hip::thread_y<Y_SIZE>, hip::thread_x<X_SIZE>>;
3713 template<int Y_SIZE, int Z_SIZE>
3714 using hip_flatten_thread_size_yz_loop =
3715  hip_flatten_indexer_loop<hip::thread_y<Y_SIZE>, hip::thread_z<Z_SIZE>>;
3716 template<int Z_SIZE, int X_SIZE>
3717 using hip_flatten_thread_size_zx_loop =
3718  hip_flatten_indexer_loop<hip::thread_z<Z_SIZE>, hip::thread_x<X_SIZE>>;
3719 template<int Z_SIZE, int Y_SIZE>
3720 using hip_flatten_thread_size_zy_loop =
3721  hip_flatten_indexer_loop<hip::thread_z<Z_SIZE>, hip::thread_y<Y_SIZE>>;
3722 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3723 using hip_flatten_thread_size_xyz_loop =
3724  hip_flatten_indexer_loop<hip::thread_x<X_SIZE>,
3725  hip::thread_y<Y_SIZE>,
3726  hip::thread_z<Z_SIZE>>;
3727 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3728 using hip_flatten_thread_size_xzy_loop =
3729  hip_flatten_indexer_loop<hip::thread_x<X_SIZE>,
3730  hip::thread_z<Z_SIZE>,
3731  hip::thread_y<Y_SIZE>>;
3732 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3733 using hip_flatten_thread_size_yxz_loop =
3734  hip_flatten_indexer_loop<hip::thread_y<Y_SIZE>,
3735  hip::thread_x<X_SIZE>,
3736  hip::thread_z<Z_SIZE>>;
3737 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3738 using hip_flatten_thread_size_yzx_loop =
3739  hip_flatten_indexer_loop<hip::thread_y<Y_SIZE>,
3740  hip::thread_z<Z_SIZE>,
3741  hip::thread_x<X_SIZE>>;
3742 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3743 using hip_flatten_thread_size_zxy_loop =
3744  hip_flatten_indexer_loop<hip::thread_z<Z_SIZE>,
3745  hip::thread_x<X_SIZE>,
3746  hip::thread_y<Y_SIZE>>;
3747 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3748 using hip_flatten_thread_size_zyx_loop =
3749  hip_flatten_indexer_loop<hip::thread_z<Z_SIZE>,
3750  hip::thread_y<Y_SIZE>,
3751  hip::thread_x<X_SIZE>>;
3752 
// Block variants of the flatten loop policies. Each alias passes the
// hip::block_* indexers named by its suffix, in suffix order, to
// hip_flatten_indexer_loop. The template parameters (one size per
// dimension) are listed in that same order.
3753 template<int X_SIZE>
3754 using hip_flatten_block_size_x_loop =
3755  hip_flatten_indexer_loop<hip::block_x<X_SIZE>>;
3756 template<int Y_SIZE>
3757 using hip_flatten_block_size_y_loop =
3758  hip_flatten_indexer_loop<hip::block_y<Y_SIZE>>;
3759 template<int Z_SIZE>
3760 using hip_flatten_block_size_z_loop =
3761  hip_flatten_indexer_loop<hip::block_z<Z_SIZE>>;
3762 template<int X_SIZE, int Y_SIZE>
3763 using hip_flatten_block_size_xy_loop =
3764  hip_flatten_indexer_loop<hip::block_x<X_SIZE>, hip::block_y<Y_SIZE>>;
3765 template<int X_SIZE, int Z_SIZE>
3766 using hip_flatten_block_size_xz_loop =
3767  hip_flatten_indexer_loop<hip::block_x<X_SIZE>, hip::block_z<Z_SIZE>>;
3768 template<int Y_SIZE, int X_SIZE>
3769 using hip_flatten_block_size_yx_loop =
3770  hip_flatten_indexer_loop<hip::block_y<Y_SIZE>, hip::block_x<X_SIZE>>;
3771 template<int Y_SIZE, int Z_SIZE>
3772 using hip_flatten_block_size_yz_loop =
3773  hip_flatten_indexer_loop<hip::block_y<Y_SIZE>, hip::block_z<Z_SIZE>>;
3774 template<int Z_SIZE, int X_SIZE>
3775 using hip_flatten_block_size_zx_loop =
3776  hip_flatten_indexer_loop<hip::block_z<Z_SIZE>, hip::block_x<X_SIZE>>;
3777 template<int Z_SIZE, int Y_SIZE>
3778 using hip_flatten_block_size_zy_loop =
3779  hip_flatten_indexer_loop<hip::block_z<Z_SIZE>, hip::block_y<Y_SIZE>>;
3780 template<int X_SIZE, int Y_SIZE, int Z_SIZE>
3781 using hip_flatten_block_size_xyz_loop =
3782  hip_flatten_indexer_loop<hip::block_x<X_SIZE>,
3783  hip::block_y<Y_SIZE>,
3784  hip::block_z<Z_SIZE>>;
3785 template<int X_SIZE, int Z_SIZE, int Y_SIZE>
3786 using hip_flatten_block_size_xzy_loop =
3787  hip_flatten_indexer_loop<hip::block_x<X_SIZE>,
3788  hip::block_z<Z_SIZE>,
3789  hip::block_y<Y_SIZE>>;
3790 template<int Y_SIZE, int X_SIZE, int Z_SIZE>
3791 using hip_flatten_block_size_yxz_loop =
3792  hip_flatten_indexer_loop<hip::block_y<Y_SIZE>,
3793  hip::block_x<X_SIZE>,
3794  hip::block_z<Z_SIZE>>;
3795 template<int Y_SIZE, int Z_SIZE, int X_SIZE>
3796 using hip_flatten_block_size_yzx_loop =
3797  hip_flatten_indexer_loop<hip::block_y<Y_SIZE>,
3798  hip::block_z<Z_SIZE>,
3799  hip::block_x<X_SIZE>>;
3800 template<int Z_SIZE, int X_SIZE, int Y_SIZE>
3801 using hip_flatten_block_size_zxy_loop =
3802  hip_flatten_indexer_loop<hip::block_z<Z_SIZE>,
3803  hip::block_x<X_SIZE>,
3804  hip::block_y<Y_SIZE>>;
3805 template<int Z_SIZE, int Y_SIZE, int X_SIZE>
3806 using hip_flatten_block_size_zyx_loop =
3807  hip_flatten_indexer_loop<hip::block_z<Z_SIZE>,
3808  hip::block_y<Y_SIZE>,
3809  hip::block_x<X_SIZE>>;
3810 
// Global variants of the flatten loop policies. Each alias passes the
// hip::global_* indexers named by its suffix, in suffix order, to
// hip_flatten_indexer_loop. Template parameters list every block size first
// (in suffix order) and then the matching grid sizes in the same order; each
// grid size defaults to named_usage::unspecified.
3811 template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
3812 using hip_flatten_global_size_x_loop =
3813  hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3814 template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
3815 using hip_flatten_global_size_y_loop =
3816  hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3817 template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
3818 using hip_flatten_global_size_z_loop =
3819  hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3820 template<int X_BLOCK_SIZE,
3821  int Y_BLOCK_SIZE,
3822  int X_GRID_SIZE = named_usage::unspecified,
3823  int Y_GRID_SIZE = named_usage::unspecified>
3824 using hip_flatten_global_size_xy_loop =
3825  hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3826  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3827 template<int X_BLOCK_SIZE,
3828  int Z_BLOCK_SIZE,
3829  int X_GRID_SIZE = named_usage::unspecified,
3830  int Z_GRID_SIZE = named_usage::unspecified>
3831 using hip_flatten_global_size_xz_loop =
3832  hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3833  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3834 template<int Y_BLOCK_SIZE,
3835  int X_BLOCK_SIZE,
3836  int Y_GRID_SIZE = named_usage::unspecified,
3837  int X_GRID_SIZE = named_usage::unspecified>
3838 using hip_flatten_global_size_yx_loop =
3839  hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3840  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3841 template<int Y_BLOCK_SIZE,
3842  int Z_BLOCK_SIZE,
3843  int Y_GRID_SIZE = named_usage::unspecified,
3844  int Z_GRID_SIZE = named_usage::unspecified>
3845 using hip_flatten_global_size_yz_loop =
3846  hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3847  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3848 template<int Z_BLOCK_SIZE,
3849  int X_BLOCK_SIZE,
3850  int Z_GRID_SIZE = named_usage::unspecified,
3851  int X_GRID_SIZE = named_usage::unspecified>
3852 using hip_flatten_global_size_zx_loop =
3853  hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3854  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3855 template<int Z_BLOCK_SIZE,
3856  int Y_BLOCK_SIZE,
3857  int Z_GRID_SIZE = named_usage::unspecified,
3858  int Y_GRID_SIZE = named_usage::unspecified>
3859 using hip_flatten_global_size_zy_loop =
3860  hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3861  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3862 template<int X_BLOCK_SIZE,
3863  int Y_BLOCK_SIZE,
3864  int Z_BLOCK_SIZE,
3865  int X_GRID_SIZE = named_usage::unspecified,
3866  int Y_GRID_SIZE = named_usage::unspecified,
3867  int Z_GRID_SIZE = named_usage::unspecified>
3868 using hip_flatten_global_size_xyz_loop =
3869  hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3870  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3871  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3872 template<int X_BLOCK_SIZE,
3873  int Z_BLOCK_SIZE,
3874  int Y_BLOCK_SIZE,
3875  int X_GRID_SIZE = named_usage::unspecified,
3876  int Z_GRID_SIZE = named_usage::unspecified,
3877  int Y_GRID_SIZE = named_usage::unspecified>
3878 using hip_flatten_global_size_xzy_loop =
3879  hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3880  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3881  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3882 template<int Y_BLOCK_SIZE,
3883  int X_BLOCK_SIZE,
3884  int Z_BLOCK_SIZE,
3885  int Y_GRID_SIZE = named_usage::unspecified,
3886  int X_GRID_SIZE = named_usage::unspecified,
3887  int Z_GRID_SIZE = named_usage::unspecified>
3888 using hip_flatten_global_size_yxz_loop =
3889  hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3890  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3891  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
3892 template<int Y_BLOCK_SIZE,
3893  int Z_BLOCK_SIZE,
3894  int X_BLOCK_SIZE,
3895  int Y_GRID_SIZE = named_usage::unspecified,
3896  int Z_GRID_SIZE = named_usage::unspecified,
3897  int X_GRID_SIZE = named_usage::unspecified>
3898 using hip_flatten_global_size_yzx_loop =
3899  hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3900  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3901  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3902 template<int Z_BLOCK_SIZE,
3903  int X_BLOCK_SIZE,
3904  int Y_BLOCK_SIZE,
3905  int Z_GRID_SIZE = named_usage::unspecified,
3906  int X_GRID_SIZE = named_usage::unspecified,
3907  int Y_GRID_SIZE = named_usage::unspecified>
3908 using hip_flatten_global_size_zxy_loop =
3909  hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3910  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
3911  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
3912 template<int Z_BLOCK_SIZE,
3913  int Y_BLOCK_SIZE,
3914  int X_BLOCK_SIZE,
3915  int Z_GRID_SIZE = named_usage::unspecified,
3916  int Y_GRID_SIZE = named_usage::unspecified,
3917  int X_GRID_SIZE = named_usage::unspecified>
3918 using hip_flatten_global_size_zyx_loop =
3919  hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
3920  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
3921  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
3922 
3923 
3924 /*
3925  * Deprecated policies
3926  */
// Each deprecated name below is a plain alias of the policy on its
// right-hand side, so both names refer to the same type.
//
// hip_global_thread_<dims> is an alias of hip_global_<dims>_direct.
3927 using hip_global_thread_x = hip_global_x_direct;
3928 using hip_global_thread_y = hip_global_y_direct;
3929 using hip_global_thread_z = hip_global_z_direct;
3930 
3931 using hip_global_thread_xy = hip_global_xy_direct;
3932 using hip_global_thread_xz = hip_global_xz_direct;
3933 using hip_global_thread_yx = hip_global_yx_direct;
3934 using hip_global_thread_yz = hip_global_yz_direct;
3935 using hip_global_thread_zx = hip_global_zx_direct;
3936 using hip_global_thread_zy = hip_global_zy_direct;
3937 
3938 using hip_global_thread_xyz = hip_global_xyz_direct;
3939 using hip_global_thread_xzy = hip_global_xzy_direct;
3940 using hip_global_thread_yxz = hip_global_yxz_direct;
3941 using hip_global_thread_yzx = hip_global_yzx_direct;
3942 using hip_global_thread_zxy = hip_global_zxy_direct;
3943 using hip_global_thread_zyx = hip_global_zyx_direct;
3944 
// hip_flatten_block_threads_<dims>_direct is an alias of
// hip_flatten_thread_<dims>_direct.
3945 using hip_flatten_block_threads_xy_direct = hip_flatten_thread_xy_direct;
3946 using hip_flatten_block_threads_xz_direct = hip_flatten_thread_xz_direct;
3947 using hip_flatten_block_threads_yx_direct = hip_flatten_thread_yx_direct;
3948 using hip_flatten_block_threads_yz_direct = hip_flatten_thread_yz_direct;
3949 using hip_flatten_block_threads_zx_direct = hip_flatten_thread_zx_direct;
3950 using hip_flatten_block_threads_zy_direct = hip_flatten_thread_zy_direct;
3951 
3952 using hip_flatten_block_threads_xyz_direct = hip_flatten_thread_xyz_direct;
3953 using hip_flatten_block_threads_xzy_direct = hip_flatten_thread_xzy_direct;
3954 using hip_flatten_block_threads_yxz_direct = hip_flatten_thread_yxz_direct;
3955 using hip_flatten_block_threads_yzx_direct = hip_flatten_thread_yzx_direct;
3956 using hip_flatten_block_threads_zxy_direct = hip_flatten_thread_zxy_direct;
3957 using hip_flatten_block_threads_zyx_direct = hip_flatten_thread_zyx_direct;
3958 
// hip_flatten_block_threads_<dims>_loop is an alias of
// hip_flatten_thread_<dims>_loop.
3959 using hip_flatten_block_threads_xy_loop = hip_flatten_thread_xy_loop;
3960 using hip_flatten_block_threads_xz_loop = hip_flatten_thread_xz_loop;
3961 using hip_flatten_block_threads_yx_loop = hip_flatten_thread_yx_loop;
3962 using hip_flatten_block_threads_yz_loop = hip_flatten_thread_yz_loop;
3963 using hip_flatten_block_threads_zx_loop = hip_flatten_thread_zx_loop;
3964 using hip_flatten_block_threads_zy_loop = hip_flatten_thread_zy_loop;
3965 
3966 using hip_flatten_block_threads_xyz_loop = hip_flatten_thread_xyz_loop;
3967 using hip_flatten_block_threads_xzy_loop = hip_flatten_thread_xzy_loop;
3968 using hip_flatten_block_threads_yxz_loop = hip_flatten_thread_yxz_loop;
3969 using hip_flatten_block_threads_yzx_loop = hip_flatten_thread_yzx_loop;
3970 using hip_flatten_block_threads_zxy_loop = hip_flatten_thread_zxy_loop;
3971 using hip_flatten_block_threads_zyx_loop = hip_flatten_thread_zyx_loop;
3972 
// hip_block_<dims>_nested_direct is an alias of hip_block_<dims>_direct.
3973 using hip_block_xy_nested_direct = hip_block_xy_direct;
3974 using hip_block_xz_nested_direct = hip_block_xz_direct;
3975 using hip_block_yx_nested_direct = hip_block_yx_direct;
3976 using hip_block_yz_nested_direct = hip_block_yz_direct;
3977 using hip_block_zx_nested_direct = hip_block_zx_direct;
3978 using hip_block_zy_nested_direct = hip_block_zy_direct;
3979 
3980 using hip_block_xyz_nested_direct = hip_block_xyz_direct;
3981 using hip_block_xzy_nested_direct = hip_block_xzy_direct;
3982 using hip_block_yxz_nested_direct = hip_block_yxz_direct;
3983 using hip_block_yzx_nested_direct = hip_block_yzx_direct;
3984 using hip_block_zxy_nested_direct = hip_block_zxy_direct;
3985 using hip_block_zyx_nested_direct = hip_block_zyx_direct;
3986 
// hip_block_<dims>_nested_loop is an alias of hip_block_<dims>_loop.
3987 using hip_block_xy_nested_loop = hip_block_xy_loop;
3988 using hip_block_xz_nested_loop = hip_block_xz_loop;
3989 using hip_block_yx_nested_loop = hip_block_yx_loop;
3990 using hip_block_yz_nested_loop = hip_block_yz_loop;
3991 using hip_block_zx_nested_loop = hip_block_zx_loop;
3992 using hip_block_zy_nested_loop = hip_block_zy_loop;
3993 
3994 using hip_block_xyz_nested_loop = hip_block_xyz_loop;
3995 using hip_block_xzy_nested_loop = hip_block_xzy_loop;
3996 using hip_block_yxz_nested_loop = hip_block_yxz_loop;
3997 using hip_block_yzx_nested_loop = hip_block_yzx_loop;
3998 using hip_block_zxy_nested_loop = hip_block_zxy_loop;
3999 using hip_block_zyx_nested_loop = hip_block_zyx_loop;
4000 
4001 } // namespace RAJA
4002 
4003 #endif // RAJA_HIP_ACTIVE
4004 #endif // RAJA_policy_hip_HPP
RAJA header file defining Simple Offset Calculators.
Header file for RAJA operator definitions.
Header file for basic RAJA policy mechanics.
Header file containing RAJA intrinsics templates for HIP execution.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
Header file providing RAJA math templates.
multi_reduce_algorithm
Definition: policy.hpp:31
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
named_dim
Definition: types.hpp:53
Launch
Definition: PolicyBase.hpp:60
RAJA_HOST_DEVICE constexpr RAJA_INLINE T next_pow2(T n) noexcept
"round up" to the next greatest power of 2
Definition: math.hpp:63
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
kernel_sync_requirement
Definition: types.hpp:63
named_usage
Definition: types.hpp:44
@ ignored
Definition: types.hpp:45
@ unspecified
Definition: types.hpp:46
PolicyBaseT< Policy_, Pattern_, Launch_, Platform::undefined, Args... > make_policy_pattern_launch_t
Definition: PolicyBase.hpp:180
RAJA_HOST_DEVICE constexpr RAJA_INLINE T prev_pow2(T n) noexcept
"round down" to the largest power of 2 that is less than or equal to n
Definition: math.hpp:85
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file providing RAJA reduction declarations.
Header file containing RAJA sequential policy definitions.
static constexpr int_t multiply(int_t val) noexcept
Definition: types.hpp:255
Definition: PolicyBase.hpp:75
Header file for RAJA type definitions.