23 #ifndef RAJA_hip_reduce_HPP
24 #define RAJA_hip_reduce_HPP
26 #include "RAJA/config.hpp"
28 #if defined(RAJA_ENABLE_HIP)
30 #include <type_traits>
33 #include <hip/hip_runtime.h>
61 template<
typename Combiner>
67 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
76 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
85 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
92 struct atomic<and_bit<T>>
94 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
101 struct atomic<or_bit<T>>
103 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
110 struct hip_atomic_available
112 static constexpr
const bool value =
113 (std::is_integral<T>::value && (4 ==
sizeof(T) || 8 ==
sizeof(T))) ||
114 std::is_same<T, float>::value || std::is_same<T, double>::value;
129 template<
typename Combiner,
134 typename TempIterator>
135 RAJA_DEVICE RAJA_INLINE
int grid_reduce_last_block(T& val,
137 TempIterator in_device_mem,
138 unsigned int* device_count)
140 typename TempIterator::template rebind_accessor<Accessor> device_mem(
143 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
144 (blockDim.x * blockDim.y) * threadIdx.z;
145 int numThreads = blockDim.x * blockDim.y * blockDim.z;
147 int blockId = blockIdx.x + gridDim.x * blockIdx.y +
148 (gridDim.x * gridDim.y) * blockIdx.z;
149 int numBlocks = gridDim.x * gridDim.y * gridDim.z;
151 int replicationId = blockId % replication;
152 int slotId = blockId / replication;
154 int maxNumSlots = (numBlocks + replication - 1) / replication;
155 unsigned int numSlots = (numBlocks / replication) +
156 ((replicationId < (numBlocks % replication)) ? 1 : 0);
158 int atomicOffset = replicationId * atomic_stride;
159 int beginSlots = replicationId * maxNumSlots;
160 int blockSlot = beginSlots + slotId;
162 T temp = block_reduce<Combiner>(val, identity);
170 return (threadId == 0) ? replicationId : replication;
174 __shared__
bool isLastBlock;
177 device_mem.set(blockSlot, temp);
179 Accessor::fence_release();
181 unsigned int old_count =
182 ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
183 isLastBlock = (old_count == (numSlots - 1));
193 Accessor::fence_acquire();
195 for (
unsigned int i = threadId; i < numSlots; i += numThreads)
197 Combiner {}(temp, device_mem.get(beginSlots + i));
200 temp = block_reduce<Combiner>(temp, identity);
209 return (isLastBlock && threadId == 0) ? replicationId : replication;
215 template<
typename ThreadIterationGetter,
typename Combiner,
typename T>
216 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
218 const int numThreads = ThreadIterationGetter::size();
219 const int threadId = ThreadIterationGetter::index();
221 const int warpId = threadId % RAJA::policy::hip::device_constants.WARP_SIZE;
222 const int warpNum = threadId / RAJA::policy::hip::device_constants.WARP_SIZE;
226 if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0)
230 for (
int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
232 T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
233 temp = Combiner {}(temp, rhs);
240 for (
int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
242 int srcLane = threadId ^ i;
243 T rhs = RAJA::hip::impl::shfl_sync(temp, srcLane);
245 if (srcLane < numThreads)
247 temp = Combiner {}(temp, rhs);
252 static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <=
253 RAJA::policy::hip::device_constants.WARP_SIZE,
254 "Max Warps must be less than or equal to Warp Size for this "
255 "algorithm to work");
258 if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE)
262 __shared__
unsigned char tmpsd[
sizeof(
264 RAJA::policy::hip::device_constants.MAX_WARPS>)];
270 T, RAJA::policy::hip::device_constants.MAX_WARPS
>*>(tmpsd);
275 sd->
set(warpNum, temp);
284 if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads)
286 temp = sd->get(warpId);
293 for (
int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2)
295 T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
296 temp = Combiner {}(temp, rhs);
306 template<
typename GlobalIterationGetter,
typename OP,
typename T>
311 unsigned int* device_count)
313 using BlockIterationGetter =
314 typename get_index_block<GlobalIterationGetter>::type;
315 using ThreadIterationGetter =
316 typename get_index_thread<GlobalIterationGetter>::type;
318 const int numBlocks = BlockIterationGetter::size();
319 const int numThreads = ThreadIterationGetter::size();
320 const unsigned int wrap_around = numBlocks - 1;
322 const int blockId = BlockIterationGetter::index();
323 const int threadId = ThreadIterationGetter::index();
325 T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
328 bool lastBlock =
false;
331 device_mem.
set(blockId, temp);
335 unsigned int old_count =
::atomicInc(device_count, wrap_around);
336 lastBlock = (old_count == wrap_around);
340 lastBlock = __syncthreads_or(lastBlock);
345 temp = OP::identity();
348 for (
int i = threadId; i < numBlocks; i += numThreads)
350 temp = OP {}(temp, device_mem.
get(i));
353 temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
358 *device_target = temp;
367 template<
typename Combiner,
372 RAJA_DEVICE RAJA_INLINE
int grid_reduce_atomic_device_init(
376 unsigned int* device_count)
378 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
379 (blockDim.x * blockDim.y) * threadIdx.z;
381 int blockId = blockIdx.x + gridDim.x * blockIdx.y +
382 (gridDim.x * gridDim.y) * blockIdx.z;
383 int numBlocks = gridDim.x * gridDim.y * gridDim.z;
385 int replicationId = (blockId % replication);
386 int atomicOffset = replicationId * atomic_stride;
388 unsigned int numSlots = (numBlocks / replication) +
389 ((replicationId < (numBlocks % replication)) ? 1 : 0);
393 T temp = block_reduce<Combiner>(val, identity);
398 return (threadId == 0) ? replicationId : replication;
404 unsigned int old_val =
::atomicCAS(&device_count[atomicOffset], 0u, 1u);
407 Accessor::set(device_mem, atomicOffset, identity);
408 Accessor::fence_release();
413 T temp = block_reduce<Combiner>(val, identity);
416 bool isLastBlock =
false;
420 while (::
atomicAdd(&device_count[atomicOffset], 0u) < 2u)
422 Accessor::fence_acquire();
423 RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
424 Accessor::fence_release();
426 unsigned int old_count =
427 ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
428 isLastBlock = (old_count == (numSlots + 1));
433 Accessor::fence_acquire();
438 return isLastBlock ? replicationId : replication;
442 template<
typename Combiner,
int replication,
int atomic_str
ide,
typename T>
443 RAJA_DEVICE RAJA_INLINE
void grid_reduce_atomic_host_init(T& val,
447 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
448 (blockDim.x * blockDim.y) * threadIdx.z;
450 int blockId = blockIdx.x + gridDim.x * blockIdx.y +
451 (gridDim.x * gridDim.y) * blockIdx.z;
453 int replicationId = (blockId % replication);
454 int atomicOffset = replicationId * atomic_stride;
456 T temp = block_reduce<Combiner>(val, identity);
459 if (threadId == 0 && temp != identity)
461 RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
469 template<
typename T,
size_t num_slots,
typename mempool>
484 ::RAJA::resources::Hip res;
489 class ResourceIterator
492 ResourceIterator() =
delete;
494 ResourceIterator(ResourceNode* rn) : m_rn(rn) {}
496 const ResourceIterator& operator++()
502 ResourceIterator operator++(
int)
504 ResourceIterator ret = *
this;
509 ::RAJA::resources::Hip&
operator*() {
return m_rn->res; }
511 bool operator==(
const ResourceIterator& rhs)
const
513 return m_rn == rhs.m_rn;
516 bool operator!=(
const ResourceIterator& rhs)
const
518 return !this->operator==(rhs);
526 class ResourceNodeIterator
529 ResourceNodeIterator() =
delete;
531 ResourceNodeIterator(ResourceNode* rn, Node* n) : m_rn(rn), m_n(n) {}
533 const ResourceNodeIterator& operator++()
542 m_n = m_rn->node_list;
552 ResourceNodeIterator operator++(
int)
554 ResourceNodeIterator ret = *
this;
559 auto operator*() -> T (&)[num_slots] {
return m_n->values; }
561 bool operator==(
const ResourceNodeIterator& rhs)
const
563 return m_n == rhs.m_n;
566 bool operator!=(
const ResourceNodeIterator& rhs)
const
568 return !this->operator==(rhs);
576 PinnedTally() : resource_list(nullptr) {}
578 PinnedTally(
const PinnedTally&) =
delete;
581 ResourceIterator resourceBegin() {
return {resource_list}; }
584 ResourceIterator resourceEnd() {
return {
nullptr}; }
587 ResourceNodeIterator begin()
589 return {resource_list, resource_list ? resource_list->node_list :
nullptr};
593 ResourceNodeIterator end() {
return {
nullptr,
nullptr}; }
596 auto new_value(::RAJA::resources::Hip res) -> T (&)[num_slots]
598 std::lock_guard<std::mutex> lock(m_mutex);
599 ResourceNode* rn = resource_list;
602 if (rn->res.get_stream() == res.get_stream())
break;
607 rn = (ResourceNode*)malloc(
sizeof(ResourceNode));
608 rn->next = resource_list;
610 rn->node_list =
nullptr;
613 Node* n = mempool::getInstance().template malloc<Node>(1);
614 n->next = rn->node_list;
620 void synchronize_resources()
622 auto end = resourceEnd();
623 for (
auto r = resourceBegin(); r != end; ++r)
632 while (resource_list)
634 ResourceNode* rn = resource_list;
635 while (rn->node_list)
637 Node* n = rn->node_list;
638 rn->node_list = n->next;
639 mempool::getInstance().free(n);
641 resource_list = rn->next;
646 ~PinnedTally() { free_list(); }
651 ResourceNode* resource_list;
664 template<
typename Combiner,
668 size_t atomic_stride>
669 struct ReduceLastBlock_Data
671 using tally_mempool_type = pinned_mempool_type;
672 using data_mempool_type = device_mempool_type;
673 using count_mempool_type = device_zeroed_mempool_type;
675 static constexpr
size_t tally_slots = replication;
679 unsigned int* device_count;
683 ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {};
690 ReduceLastBlock_Data(T initValue, T identity_)
692 identity {identity_},
693 device_count {nullptr},
695 own_device_ptr {false}
699 ReduceLastBlock_Data(
const ReduceLastBlock_Data& other)
700 : value {other.identity},
701 identity {other.identity},
702 device_count {other.device_count},
703 device {other.device},
704 own_device_ptr {false}
707 ReduceLastBlock_Data& operator=(
const ReduceLastBlock_Data&) =
default;
711 T* init_grid_vals(T (&output)[tally_slots])
713 for (
size_t r = 0; r < tally_slots; ++r)
715 output[r] = identity;
722 void grid_reduce(T* output)
725 size_t replicationId =
726 impl::grid_reduce_last_block<Combiner, Accessor, replication,
727 atomic_stride>(temp, identity, device,
729 if (replicationId != replication)
731 output[replicationId] = temp;
737 bool setupForDevice()
739 bool act = !device.allocated() && setupReducers();
742 hip_dim_t gridDim = currentGridDim();
743 size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
744 size_t maxNumSlots = (numBlocks + replication - 1) / replication;
745 device.allocate(maxNumSlots * replication);
747 count_mempool_type::getInstance().template malloc<unsigned int>(
748 replication * atomic_stride);
749 own_device_ptr =
true;
756 bool teardownForDevice()
758 bool act = own_device_ptr;
762 count_mempool_type::getInstance().free(device_count);
763 device_count =
nullptr;
764 own_device_ptr =
false;
771 template<
typename Combiner,
774 size_t atomic_stride>
775 struct ReduceAtomicHostInit_Data
777 using tally_mempool_type = device_pinned_mempool_type;
779 static constexpr
size_t tally_slots = replication * atomic_stride;
786 ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {}
788 ReduceAtomicHostInit_Data(T initValue, T identity_)
790 identity {identity_},
792 own_device_ptr {false}
796 ReduceAtomicHostInit_Data(
const ReduceAtomicHostInit_Data& other)
797 : value {other.identity},
798 identity {other.identity},
799 is_setup {other.is_setup},
800 own_device_ptr {false}
803 ReduceAtomicHostInit_Data& operator=(
const ReduceAtomicHostInit_Data&) =
808 T* init_grid_vals(T (&output)[tally_slots])
810 for (
size_t r = 0; r < tally_slots; ++r)
812 output[r] = identity;
819 void grid_reduce(T* output)
823 impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
824 temp, identity, output);
829 bool setupForDevice()
831 bool act = !is_setup && setupReducers();
835 own_device_ptr =
true;
842 bool teardownForDevice()
844 bool act = own_device_ptr;
848 own_device_ptr =
false;
855 template<
typename Combiner,
859 size_t atomic_stride>
860 struct ReduceAtomicDeviceInit_Data
862 using tally_mempool_type = pinned_mempool_type;
863 using data_mempool_type = device_mempool_type;
864 using count_mempool_type = device_zeroed_mempool_type;
866 static constexpr
size_t tally_slots = replication;
870 unsigned int* device_count;
874 ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {}
876 ReduceAtomicDeviceInit_Data(T initValue, T identity_)
878 identity {identity_},
879 device_count {nullptr},
881 own_device_ptr {false}
885 ReduceAtomicDeviceInit_Data(
const ReduceAtomicDeviceInit_Data& other)
886 : value {other.identity},
887 identity {other.identity},
888 device_count {other.device_count},
889 device {other.device},
890 own_device_ptr {false}
893 ReduceAtomicDeviceInit_Data& operator=(
const ReduceAtomicDeviceInit_Data&) =
898 T* init_grid_vals(T (&output)[tally_slots])
900 for (
size_t r = 0; r < tally_slots; ++r)
902 output[r] = identity;
909 void grid_reduce(T* output)
913 size_t replicationId =
914 impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
916 temp, identity, device, device_count);
917 if (replicationId != replication)
919 output[replicationId] = temp;
925 bool setupForDevice()
927 bool act = !device && setupReducers();
930 device = data_mempool_type::getInstance().template malloc<T>(
931 replication * atomic_stride);
933 count_mempool_type::getInstance().template malloc<unsigned int>(
934 replication * atomic_stride);
935 own_device_ptr =
true;
942 bool teardownForDevice()
944 bool act = own_device_ptr;
947 data_mempool_type::getInstance().free(device);
949 count_mempool_type::getInstance().free(device_count);
950 device_count =
nullptr;
951 own_device_ptr =
false;
958 template<
typename Combiner,
typename T,
typename tuning>
961 static constexpr
size_t replication =
962 (tuning::replication > 0) ? tuning::replication : 32;
963 static constexpr
size_t atomic_stride =
964 (tuning::atomic_stride > 0)
965 ? tuning::atomic_stride
966 : ((policy::hip::device_constants
967 .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE >
sizeof(T))
969 policy::hip::device_constants
970 .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
974 using Accessor = std::conditional_t<
975 (tuning::comm_mode == block_communication_mode::block_fence),
976 impl::AccessorDeviceScopeUseBlockFence,
977 std::conditional_t<(tuning::comm_mode ==
978 block_communication_mode::device_fence),
979 impl::AccessorDeviceScopeUseDeviceFence,
982 static constexpr
bool atomic_policy =
983 (tuning::algorithm ==
984 reduce_algorithm::init_device_combine_atomic_block) ||
985 (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
986 static constexpr
bool atomic_available =
987 RAJA::reduce::hip::hip_atomic_available<T>::value;
990 using reduce_data_type = std::conditional_t<
991 (tuning::algorithm == reduce_algorithm::combine_last_block) ||
992 (atomic_policy && !atomic_available),
993 hip::ReduceLastBlock_Data<Combiner,
1001 (tuning::algorithm ==
1002 reduce_algorithm::init_device_combine_atomic_block),
1003 hip::ReduceAtomicDeviceInit_Data<Combiner,
1009 (tuning::algorithm ==
1010 reduce_algorithm::init_host_combine_atomic_block),
1011 hip::ReduceAtomicHostInit_Data<Combiner,
1018 static constexpr
size_t tally_slots = reduce_data_type::tally_slots;
1020 using TallyType = PinnedTally<T,
1022 typename reduce_data_type::tally_mempool_type>;
1030 constexpr tally_u(TallyType* l) : list(l) {};
1031 constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
1039 explicit Reduce(T init_val, T identity_ = Combiner::identity())
1041 tally_or_val_ptr {new TallyType},
1042 val(init_val, identity_)
1045 void reset(T in_val, T identity_ = Combiner::identity())
1048 val = reduce_data_type(in_val, identity_);
1056 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1057 : parent {other.parent},
1061 tally_or_val_ptr {other.tally_or_val_ptr},
1064 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1067 if (val.setupForDevice())
1069 tally_or_val_ptr.val_ptr = val.init_grid_vals(
1070 tally_or_val_ptr.list->new_value(currentResource()));
1082 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1085 delete tally_or_val_ptr.list;
1086 tally_or_val_ptr.list =
nullptr;
1090 if (val.value != val.identity)
1092 std::lock_guard<std::mutex> lock(tally_or_val_ptr.list->m_mutex);
1093 parent->combine(val.value);
1098 if (val.teardownForDevice())
1100 tally_or_val_ptr.val_ptr =
nullptr;
1104 if (!parent->parent)
1106 val.grid_reduce(tally_or_val_ptr.val_ptr);
1110 parent->combine(val.value);
1118 auto n = tally_or_val_ptr.list->begin();
1119 auto end = tally_or_val_ptr.list->end();
1122 tally_or_val_ptr.list->synchronize_resources();
1124 std::move(val.value));
1125 for (; n != end; ++n)
1127 T(&values)[tally_slots] = *n;
1128 for (
size_t r = 0; r < tally_slots; ++r)
1130 reducer.combine(std::move(values[r]));
1133 val.value = reducer.get_and_reset();
1134 tally_or_val_ptr.list->free_list();
1140 T
get() {
return operator T(); }
1144 void combine(T other)
const { Combiner {}(val.value, other); }
1149 T& local()
const {
return val.value; }
1151 T get_combined()
const {
return val.value; }
1155 tally_u tally_or_val_ptr;
1156 reduce_data_type val;
1162 template<
typename tuning,
typename T>
1163 class ReduceSum<
RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1164 :
public hip::Reduce<RAJA::reduce::sum<T>, T, tuning>
1168 using Base = hip::Reduce<RAJA::reduce::sum<T>, T, tuning>;
1173 const ReduceSum& operator+=(T rhs)
const
1181 template<
typename tuning,
typename T>
1182 class ReduceBitOr<
RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1183 :
public hip::Reduce<RAJA::reduce::or_bit<T>, T, tuning>
1187 using Base = hip::Reduce<RAJA::reduce::or_bit<T>, T, tuning>;
1192 const ReduceBitOr& operator|=(T rhs)
const
1200 template<
typename tuning,
typename T>
1201 class ReduceBitAnd<
RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1202 :
public hip::Reduce<RAJA::reduce::and_bit<T>, T, tuning>
1206 using Base = hip::Reduce<RAJA::reduce::and_bit<T>, T, tuning>;
1211 const ReduceBitAnd& operator&=(T rhs)
const
1219 template<
typename tuning,
typename T>
1220 class ReduceMin<
RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1221 :
public hip::Reduce<RAJA::reduce::min<T>, T, tuning>
1225 using Base = hip::Reduce<RAJA::reduce::min<T>, T, tuning>;
1230 const ReduceMin&
min(T rhs)
const
1238 template<
typename tuning,
typename T>
1239 class ReduceMax<
RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1240 :
public hip::Reduce<RAJA::reduce::max<T>, T, tuning>
1244 using Base = hip::Reduce<RAJA::reduce::max<T>, T, tuning>;
1249 const ReduceMax&
max(T rhs)
const
1257 template<
typename tuning,
typename T,
typename IndexType>
1258 class ReduceMinLoc<
RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
1260 RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
1261 RAJA::reduce::detail::ValueLoc<T, IndexType>,
1269 using Base = hip::Reduce<Combiner, value_type, tuning>;
1273 ReduceMinLoc(T init_val,
1275 T identity_val = NonLocCombiner::identity(),
1276 IndexType identity_idx =
1278 : Base(value_type(init_val, init_idx),
1279 value_type(identity_val, identity_idx))
1284 void reset(T init_val,
1286 T identity_val = NonLocCombiner::identity(),
1287 IndexType identity_idx =
1290 Base::reset(value_type(init_val, init_idx),
1291 value_type(identity_val, identity_idx));
1296 const ReduceMinLoc& minloc(T rhs, IndexType loc)
const
1298 this->combine(value_type(rhs, loc));
1303 IndexType getLoc() {
return Base::get().getLoc(); }
1313 template<
typename tuning,
typename T,
typename IndexType>
1314 class ReduceMaxLoc<
RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
1317 RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
1318 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
1325 using Base = hip::Reduce<Combiner, value_type, tuning>;
1329 ReduceMaxLoc(T init_val,
1331 T identity_val = NonLocCombiner::identity(),
1332 IndexType identity_idx =
1334 : Base(value_type(init_val, init_idx),
1335 value_type(identity_val, identity_idx))
1340 void reset(T init_val,
1342 T identity_val = NonLocCombiner::identity(),
1343 IndexType identity_idx =
1346 Base::reset(value_type(init_val, init_idx),
1347 value_type(identity_val, identity_idx));
1352 const ReduceMaxLoc& maxloc(T rhs, IndexType loc)
const
1354 this->combine(value_type(rhs, loc));
1359 IndexType getLoc() {
return Base::get().getLoc(); }
Header file defining prototypes for routines used to manage memory for HIP reductions and other opera...
Header file for common RAJA internal definitions.
Header file for common RAJA internal definitions.
RAJA header file containing an implementation of a memory pool.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Pointer class specialized for Struct of Array data layout allocated via RAJA basic_mempools.
Definition: SoAPtr.hpp:52
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAPtr.hpp:100
constexpr RAJA_HOST_DEVICE value_type get(size_t i) const
Definition: SoAPtr.hpp:95
Definition: reduce.hpp:131
Header file containing RAJA intrinsics templates for HIP execution.
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
constexpr auto Reduce(T *target)
Definition: reducer.hpp:231
RAJA_INLINE RAJA_HOST_DEVICE auto operator*(LHS const &left_operand, RHS const &right_operand) -> TensorMultiply< typename NormalizeOperandHelper< LHS >::return_type, RHS >
Definition: TensorMultiply.hpp:155
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
Atomic bitwise AND equivalent to (*acc) = (*acc) & value This only works with integral data types.
Definition: atomic.hpp:224
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
Atomic bitwise OR equivalent to (*acc) = (*acc) | value This only works with integral data types.
Definition: atomic.hpp:240
RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it)
returns iterator to next item
Definition: algorithm.hpp:90
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
std::conditional_t< RAJA::operators::is_fp_associative< T >::value, BinaryTreeReduce< T, BinaryOp >, LeftFoldReduce< T, BinaryOp > > HighAccuracyReduce
Definition: reduce.hpp:357
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
Atomic maximum equivalent to (*acc) = std::max(*acc, value)
Definition: atomic.hpp:156
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
Atomic compare and swap.
Definition: atomic.hpp:286
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
Atomic increment.
Definition: atomic.hpp:168
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
Atomic minimum equivalent to (*acc) = std::min(*acc, value)
Definition: atomic.hpp:143
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Base types used in common for RAJA reducer objects.
Header file providing RAJA reduction declarations.
RAJA header file defining atomic operations for HIP.
Header file containing utility methods used in HIP operations.
Definition: reduce.hpp:115
Definition: reduce.hpp:95
Definition: reduce.hpp:91
Header file for RAJA type definitions.
Header file providing RAJA sort templates.