23 #ifndef RAJA_cuda_reduce_HPP
24 #define RAJA_cuda_reduce_HPP
26 #include "RAJA/config.hpp"
28 #if defined(RAJA_ENABLE_CUDA)
30 #include <type_traits>
48 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
67 template<
typename Combiner>
73 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
82 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
91 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
98 struct atomic<and_bit<T>>
100 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
107 struct atomic<or_bit<T>>
109 RAJA_DEVICE RAJA_INLINE
void operator()(T& val,
const T v)
116 struct cuda_atomic_available
118 static constexpr
const bool value =
119 (std::is_integral<T>::value && (4 ==
sizeof(T) || 8 ==
sizeof(T))) ||
120 std::is_same<T, float>::value || std::is_same<T, double>::value;
135 template<
typename Combiner,
140 typename TempIterator>
141 RAJA_DEVICE RAJA_INLINE
int grid_reduce_last_block(T& val,
143 TempIterator in_device_mem,
144 unsigned int* device_count)
146 typename TempIterator::template rebind_accessor<Accessor> device_mem(
149 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
150 (blockDim.x * blockDim.y) * threadIdx.z;
151 int numThreads = blockDim.x * blockDim.y * blockDim.z;
153 int blockId = blockIdx.x + gridDim.x * blockIdx.y +
154 (gridDim.x * gridDim.y) * blockIdx.z;
155 int numBlocks = gridDim.x * gridDim.y * gridDim.z;
157 int replicationId = blockId % replication;
158 int slotId = blockId / replication;
160 int maxNumSlots = (numBlocks + replication - 1) / replication;
161 unsigned int numSlots = (numBlocks / replication) +
162 ((replicationId < (numBlocks % replication)) ? 1 : 0);
164 int atomicOffset = replicationId * atomic_stride;
165 int beginSlots = replicationId * maxNumSlots;
166 int blockSlot = beginSlots + slotId;
168 T temp = block_reduce<Combiner>(val, identity);
176 return (threadId == 0) ? replicationId : replication;
180 bool isLastBlock =
false;
183 device_mem.set(blockSlot, temp);
185 Accessor::fence_release();
187 unsigned int old_count =
188 ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
189 isLastBlock = (old_count == (numSlots - 1));
193 isLastBlock = __syncthreads_or(isLastBlock);
199 Accessor::fence_acquire();
201 for (
unsigned int i = threadId; i < numSlots; i += numThreads)
203 Combiner {}(temp, device_mem.get(beginSlots + i));
206 temp = block_reduce<Combiner>(temp, identity);
215 return (isLastBlock && threadId == 0) ? replicationId : replication;
221 template<
typename ThreadIterationGetter,
typename Combiner,
typename T>
222 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
224 const int numThreads = ThreadIterationGetter::size();
225 const int threadId = ThreadIterationGetter::index();
227 const int warpId = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE;
228 const int warpNum = threadId / RAJA::policy::cuda::device_constants.WARP_SIZE;
232 if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0)
236 for (
int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
238 T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
239 temp = Combiner {}(temp, rhs);
246 for (
int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
248 int srcLane = threadId ^ i;
249 T rhs = RAJA::cuda::impl::shfl_sync(temp, srcLane);
251 if (srcLane < numThreads)
253 temp = Combiner {}(temp, rhs);
258 static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <=
259 RAJA::policy::cuda::device_constants.WARP_SIZE,
260 "Max Warps must be less than or equal to Warp Size for this "
261 "algorithm to work");
264 if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE)
268 __shared__
unsigned char
270 T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
276 T, RAJA::policy::cuda::device_constants.MAX_WARPS
>*>(tmpsd);
281 sd->
set(warpNum, temp);
290 if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads)
292 temp = sd->get(warpId);
299 for (
int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS;
302 T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
303 temp = Combiner {}(temp, rhs);
313 template<
typename GlobalIterationGetter,
typename OP,
typename T>
318 unsigned int* device_count)
320 using BlockIterationGetter =
321 typename get_index_block<GlobalIterationGetter>::type;
322 using ThreadIterationGetter =
323 typename get_index_thread<GlobalIterationGetter>::type;
325 const int numBlocks = BlockIterationGetter::size();
326 const int numThreads = ThreadIterationGetter::size();
327 const unsigned int wrap_around = numBlocks - 1;
329 const int blockId = BlockIterationGetter::index();
330 const int threadId = ThreadIterationGetter::index();
332 T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
335 bool lastBlock =
false;
338 device_mem.
set(blockId, temp);
342 unsigned int old_count =
::atomicInc(device_count, wrap_around);
343 lastBlock = (old_count == wrap_around);
347 lastBlock = __syncthreads_or(lastBlock);
352 temp = OP::identity();
355 for (
int i = threadId; i < numBlocks; i += numThreads)
357 temp = OP {}(temp, device_mem.
get(i));
360 temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
365 *device_target = temp;
374 template<
typename Combiner,
379 RAJA_DEVICE RAJA_INLINE
int grid_reduce_atomic_device_init(
383 unsigned int* device_count)
385 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
386 (blockDim.x * blockDim.y) * threadIdx.z;
388 int blockId = blockIdx.x + gridDim.x * blockIdx.y +
389 (gridDim.x * gridDim.y) * blockIdx.z;
390 int numBlocks = gridDim.x * gridDim.y * gridDim.z;
392 int replicationId = (blockId % replication);
393 int atomicOffset = replicationId * atomic_stride;
395 unsigned int numSlots = (numBlocks / replication) +
396 ((replicationId < (numBlocks % replication)) ? 1 : 0);
400 T temp = block_reduce<Combiner>(val, identity);
405 return (threadId == 0) ? replicationId : replication;
411 unsigned int old_val =
::atomicCAS(&device_count[atomicOffset], 0u, 1u);
414 Accessor::set(device_mem, atomicOffset, identity);
415 Accessor::fence_release();
420 T temp = block_reduce<Combiner>(val, identity);
423 bool isLastBlock =
false;
427 while (::
atomicAdd(&device_count[atomicOffset], 0u) < 2u)
429 Accessor::fence_acquire();
430 RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
431 Accessor::fence_release();
433 unsigned int old_count =
434 ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
435 isLastBlock = (old_count == (numSlots + 1));
440 Accessor::fence_acquire();
445 return isLastBlock ? replicationId : replication;
449 template<
typename Combiner,
int replication,
int atomic_str
ide,
typename T>
450 RAJA_DEVICE RAJA_INLINE
void grid_reduce_atomic_host_init(T& val,
454 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
455 (blockDim.x * blockDim.y) * threadIdx.z;
457 int blockId = blockIdx.x + gridDim.x * blockIdx.y +
458 (gridDim.x * gridDim.y) * blockIdx.z;
460 int replicationId = (blockId % replication);
461 int atomicOffset = replicationId * atomic_stride;
463 T temp = block_reduce<Combiner>(val, identity);
466 if (threadId == 0 && temp != identity)
468 RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
476 template<
typename T,
size_t num_slots,
typename mempool>
491 ::RAJA::resources::Cuda res;
496 class ResourceIterator
499 ResourceIterator() =
delete;
501 ResourceIterator(ResourceNode* rn) : m_rn(rn) {}
503 const ResourceIterator& operator++()
509 ResourceIterator operator++(
int)
511 ResourceIterator ret = *
this;
516 ::RAJA::resources::Cuda&
operator*() {
return m_rn->res; }
518 bool operator==(
const ResourceIterator& rhs)
const
520 return m_rn == rhs.m_rn;
523 bool operator!=(
const ResourceIterator& rhs)
const
525 return !this->operator==(rhs);
533 class ResourceNodeIterator
536 ResourceNodeIterator() =
delete;
538 ResourceNodeIterator(ResourceNode* rn, Node* n) : m_rn(rn), m_n(n) {}
540 const ResourceNodeIterator& operator++()
549 m_n = m_rn->node_list;
559 ResourceNodeIterator operator++(
int)
561 ResourceNodeIterator ret = *
this;
566 auto operator*() -> T (&)[num_slots] {
return m_n->values; }
568 bool operator==(
const ResourceNodeIterator& rhs)
const
570 return m_n == rhs.m_n;
573 bool operator!=(
const ResourceNodeIterator& rhs)
const
575 return !this->operator==(rhs);
583 PinnedTally() : resource_list(nullptr) {}
585 PinnedTally(
const PinnedTally&) =
delete;
588 ResourceIterator resourceBegin() {
return {resource_list}; }
591 ResourceIterator resourceEnd() {
return {
nullptr}; }
594 ResourceNodeIterator begin()
596 return {resource_list, resource_list ? resource_list->node_list :
nullptr};
600 ResourceNodeIterator end() {
return {
nullptr,
nullptr}; }
603 auto new_value(::RAJA::resources::Cuda res) -> T (&)[num_slots]
605 std::lock_guard<std::mutex> lock(m_mutex);
606 ResourceNode* rn = resource_list;
609 if (rn->res.get_stream() == res.get_stream())
break;
614 rn = (ResourceNode*)malloc(
sizeof(ResourceNode));
615 rn->next = resource_list;
617 rn->node_list =
nullptr;
620 Node* n = mempool::getInstance().template malloc<Node>(1);
621 n->next = rn->node_list;
627 void synchronize_resources()
629 auto end = resourceEnd();
630 for (
auto r = resourceBegin(); r != end; ++r)
639 while (resource_list)
641 ResourceNode* rn = resource_list;
642 while (rn->node_list)
644 Node* n = rn->node_list;
645 rn->node_list = n->next;
646 mempool::getInstance().free(n);
648 resource_list = rn->next;
653 ~PinnedTally() { free_list(); }
658 ResourceNode* resource_list;
671 template<
typename Combiner,
675 size_t atomic_stride>
676 struct ReduceLastBlock_Data
678 using tally_mempool_type = pinned_mempool_type;
679 using data_mempool_type = device_mempool_type;
680 using count_mempool_type = device_zeroed_mempool_type;
682 static constexpr
size_t tally_slots = replication;
686 unsigned int* device_count;
688 bool owns_device_pointer;
690 ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {}
697 ReduceLastBlock_Data(T initValue, T identity_)
699 identity {identity_},
700 device_count {nullptr},
702 owns_device_pointer {false}
706 ReduceLastBlock_Data(
const ReduceLastBlock_Data& other)
707 : value {other.identity},
708 identity {other.identity},
709 device_count {other.device_count},
710 device {other.device},
711 owns_device_pointer {false}
714 ReduceLastBlock_Data& operator=(
const ReduceLastBlock_Data&) =
default;
718 T* init_grid_vals(T (&output)[tally_slots])
720 for (
size_t r = 0; r < tally_slots; ++r)
722 output[r] = identity;
729 void grid_reduce(T* output)
733 size_t replicationId =
734 impl::grid_reduce_last_block<Combiner, Accessor, replication,
735 atomic_stride>(temp, identity, device,
737 if (replicationId != replication)
739 output[replicationId] = temp;
745 bool setupForDevice()
747 bool act = !device.allocated() && setupReducers();
750 cuda_dim_t gridDim = currentGridDim();
751 size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
752 size_t maxNumSlots = (numBlocks + replication - 1) / replication;
753 device.allocate(maxNumSlots * replication);
755 count_mempool_type::getInstance().template malloc<unsigned int>(
756 replication * atomic_stride);
757 owns_device_pointer =
true;
764 bool teardownForDevice()
766 bool act = owns_device_pointer;
770 count_mempool_type::getInstance().free(device_count);
771 device_count =
nullptr;
772 owns_device_pointer =
false;
779 template<
typename Combiner,
782 size_t atomic_stride>
783 struct ReduceAtomicHostInit_Data
785 using tally_mempool_type = device_pinned_mempool_type;
787 static constexpr
size_t tally_slots = replication * atomic_stride;
792 bool owns_device_pointer;
794 ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {};
796 ReduceAtomicHostInit_Data(T initValue, T identity_)
798 identity {identity_},
800 owns_device_pointer {false}
804 ReduceAtomicHostInit_Data(
const ReduceAtomicHostInit_Data& other)
805 : value {other.identity},
806 identity {other.identity},
807 is_setup {other.is_setup},
808 owns_device_pointer {false}
811 ReduceAtomicHostInit_Data& operator=(
const ReduceAtomicHostInit_Data&) =
816 T* init_grid_vals(T (&output)[tally_slots])
818 for (
size_t r = 0; r < tally_slots; ++r)
820 output[r] = identity;
827 void grid_reduce(T* output)
831 impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
832 temp, identity, output);
837 bool setupForDevice()
839 bool act = !is_setup && setupReducers();
843 owns_device_pointer =
true;
850 bool teardownForDevice()
852 bool act = owns_device_pointer;
856 owns_device_pointer =
false;
863 template<
typename Combiner,
867 size_t atomic_stride>
868 struct ReduceAtomicDeviceInit_Data
870 using tally_mempool_type = pinned_mempool_type;
871 using data_mempool_type = device_mempool_type;
872 using count_mempool_type = device_zeroed_mempool_type;
874 static constexpr
size_t tally_slots = replication;
878 unsigned int* device_count;
880 bool owns_device_pointer;
882 ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {};
884 ReduceAtomicDeviceInit_Data(T initValue, T identity_)
886 identity {identity_},
887 device_count {nullptr},
889 owns_device_pointer {false}
893 ReduceAtomicDeviceInit_Data(
const ReduceAtomicDeviceInit_Data& other)
894 : value {other.identity},
895 identity {other.identity},
896 device_count {other.device_count},
897 device {other.device},
898 owns_device_pointer {false}
901 ReduceAtomicDeviceInit_Data& operator=(
const ReduceAtomicDeviceInit_Data&) =
906 T* init_grid_vals(T (&output)[tally_slots])
908 for (
size_t r = 0; r < tally_slots; ++r)
910 output[r] = identity;
917 void grid_reduce(T* output)
921 size_t replicationId =
922 impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
924 temp, identity, device, device_count);
925 if (replicationId != replication)
927 output[replicationId] = temp;
933 bool setupForDevice()
935 bool act = !device && setupReducers();
938 device = data_mempool_type::getInstance().template malloc<T>(
939 replication * atomic_stride);
941 count_mempool_type::getInstance().template malloc<unsigned int>(
942 replication * atomic_stride);
943 owns_device_pointer =
true;
950 bool teardownForDevice()
952 bool act = owns_device_pointer;
955 data_mempool_type::getInstance().free(device);
957 count_mempool_type::getInstance().free(device_count);
958 device_count =
nullptr;
959 owns_device_pointer =
false;
966 template<
typename Combiner,
typename T,
typename tuning>
969 static constexpr
size_t replication =
970 (tuning::replication > 0) ? tuning::replication : 1;
971 static constexpr
size_t atomic_stride =
972 (tuning::atomic_stride > 0)
973 ? tuning::atomic_stride
974 : ((policy::cuda::device_constants
975 .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE >
sizeof(T))
977 policy::cuda::device_constants
978 .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
982 using Accessor = std::conditional_t<
983 (tuning::comm_mode == block_communication_mode::block_fence),
984 impl::AccessorDeviceScopeUseBlockFence,
985 std::conditional_t<(tuning::comm_mode ==
986 block_communication_mode::device_fence),
987 impl::AccessorDeviceScopeUseDeviceFence,
990 static constexpr
bool atomic_policy =
991 (tuning::algorithm ==
992 reduce_algorithm::init_device_combine_atomic_block) ||
993 (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
994 static constexpr
bool atomic_available =
995 RAJA::reduce::cuda::cuda_atomic_available<T>::value;
998 using reduce_data_type = std::conditional_t<
999 (tuning::algorithm == reduce_algorithm::combine_last_block) ||
1000 (atomic_policy && !atomic_available),
1001 cuda::ReduceLastBlock_Data<Combiner,
1009 (tuning::algorithm ==
1010 reduce_algorithm::init_device_combine_atomic_block),
1011 cuda::ReduceAtomicDeviceInit_Data<Combiner,
1017 (tuning::algorithm ==
1018 reduce_algorithm::init_host_combine_atomic_block),
1019 cuda::ReduceAtomicHostInit_Data<Combiner,
1026 static constexpr
size_t tally_slots = reduce_data_type::tally_slots;
1028 using TallyType = PinnedTally<T,
1030 typename reduce_data_type::tally_mempool_type>;
1038 constexpr tally_u(TallyType* l) : list(l) {};
1039 constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
1047 explicit Reduce(T init_val, T identity_ = Combiner::identity())
1049 tally_or_val_ptr {new TallyType},
1050 val(init_val, identity_)
1053 void reset(T in_val, T identity_ = Combiner::identity())
1056 val = reduce_data_type(in_val, identity_);
1064 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1065 : parent {other.parent},
1069 tally_or_val_ptr {other.tally_or_val_ptr},
1072 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1075 if (val.setupForDevice())
1077 tally_or_val_ptr.val_ptr = val.init_grid_vals(
1078 tally_or_val_ptr.list->new_value(currentResource()));
1090 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1093 delete tally_or_val_ptr.list;
1094 tally_or_val_ptr.list =
nullptr;
1098 if (val.value != val.identity)
1100 std::lock_guard<std::mutex> lock(tally_or_val_ptr.list->m_mutex);
1101 parent->combine(val.value);
1106 if (val.teardownForDevice())
1108 tally_or_val_ptr.val_ptr =
nullptr;
1112 if (!parent->parent)
1114 val.grid_reduce(tally_or_val_ptr.val_ptr);
1118 parent->combine(val.value);
1126 auto n = tally_or_val_ptr.list->begin();
1127 auto end = tally_or_val_ptr.list->end();
1130 tally_or_val_ptr.list->synchronize_resources();
1132 std::move(val.value));
1133 for (; n != end; ++n)
1135 T(&values)[tally_slots] = *n;
1136 for (
size_t r = 0; r < tally_slots; ++r)
1138 reducer.combine(std::move(values[r]));
1141 val.value = reducer.get_and_reset();
1142 tally_or_val_ptr.list->free_list();
1148 T
get() {
return operator T(); }
1152 void combine(T other)
const { Combiner {}(val.value, other); }
1157 T& local()
const {
return val.value; }
1159 T get_combined()
const {
return val.value; }
1163 tally_u tally_or_val_ptr;
1164 reduce_data_type val;
1170 template<
typename tuning,
typename T>
1171 class ReduceSum<
RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1176 using Base = cuda::Reduce<RAJA::reduce::sum<T>, T, tuning>;
1181 const ReduceSum& operator+=(T rhs)
const
1189 template<
typename tuning,
typename T>
1190 class ReduceBitOr<
RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1191 :
public cuda::Reduce<RAJA::reduce::or_bit<T>, T, tuning>
1195 using Base = cuda::Reduce<RAJA::reduce::or_bit<T>, T, tuning>;
1200 const ReduceBitOr& operator|=(T rhs)
const
1208 template<
typename tuning,
typename T>
1209 class ReduceBitAnd<
RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1210 :
public cuda::Reduce<RAJA::reduce::and_bit<T>, T, tuning>
1214 using Base = cuda::Reduce<RAJA::reduce::and_bit<T>, T, tuning>;
1219 const ReduceBitAnd& operator&=(T rhs)
const
1227 template<
typename tuning,
typename T>
1228 class ReduceMin<
RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1233 using Base = cuda::Reduce<RAJA::reduce::min<T>, T, tuning>;
1238 const ReduceMin&
min(T rhs)
const
1246 template<
typename tuning,
typename T>
1247 class ReduceMax<
RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1252 using Base = cuda::Reduce<RAJA::reduce::max<T>, T, tuning>;
1257 const ReduceMax&
max(T rhs)
const
1265 template<
typename tuning,
typename T,
typename IndexType>
1266 class ReduceMinLoc<
RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
1268 RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
1269 RAJA::reduce::detail::ValueLoc<T, IndexType>,
1277 using Base = cuda::Reduce<Combiner, value_type, tuning>;
1281 ReduceMinLoc(T init_val,
1283 T identity_val = NonLocCombiner::identity(),
1284 IndexType identity_idx =
1286 : Base(value_type(init_val, init_idx),
1287 value_type(identity_val, identity_idx))
1292 void reset(T init_val,
1294 T identity_val = NonLocCombiner::identity(),
1295 IndexType identity_idx =
1298 Base::reset(value_type(init_val, init_idx),
1299 value_type(identity_val, identity_idx));
1304 const ReduceMinLoc& minloc(T rhs, IndexType loc)
const
1306 this->combine(value_type(rhs, loc));
1311 IndexType getLoc() {
return Base::get().getLoc(); }
1321 template<
typename tuning,
typename T,
typename IndexType>
1322 class ReduceMaxLoc<
RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
1325 RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
1326 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
1333 using Base = cuda::Reduce<Combiner, value_type, tuning>;
1337 ReduceMaxLoc(T init_val,
1339 T identity_val = NonLocCombiner::identity(),
1340 IndexType identity_idx =
1342 : Base(value_type(init_val, init_idx),
1343 value_type(identity_val, identity_idx))
1348 void reset(T init_val,
1350 T identity_val = NonLocCombiner::identity(),
1351 IndexType identity_idx =
1354 Base::reset(value_type(init_val, init_idx),
1355 value_type(identity_val, identity_idx));
1360 const ReduceMaxLoc& maxloc(T rhs, IndexType loc)
const
1362 this->combine(value_type(rhs, loc));
1367 IndexType getLoc() {
return Base::get().getLoc(); }
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file for common RAJA internal definitions.
Header file for common RAJA internal definitions.
RAJA header file containing an implementation of a memory pool.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Pointer class specialized for Struct of Array data layout allocated via RAJA basic_mempools.
Definition: SoAPtr.hpp:52
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAPtr.hpp:100
constexpr RAJA_HOST_DEVICE value_type get(size_t i) const
Definition: SoAPtr.hpp:95
Definition: reduce.hpp:131
Header file containing RAJA intrinsics templates for CUDA execution.
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
constexpr auto Reduce(T *target)
Definition: reducer.hpp:231
RAJA_INLINE RAJA_HOST_DEVICE auto operator*(LHS const &left_operand, RHS const &right_operand) -> TensorMultiply< typename NormalizeOperandHelper< LHS >::return_type, RHS >
Definition: TensorMultiply.hpp:155
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
Atomic bitwise AND equivalent to (*acc) = (*acc) & value This only works with integral data types.
Definition: atomic.hpp:224
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
Atomic bitwise OR equivalent to (*acc) = (*acc) | value This only works with integral data types.
Definition: atomic.hpp:240
RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it)
returns iterator to next item
Definition: algorithm.hpp:90
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
std::conditional_t< RAJA::operators::is_fp_associative< T >::value, BinaryTreeReduce< T, BinaryOp >, LeftFoldReduce< T, BinaryOp > > HighAccuracyReduce
Definition: reduce.hpp:357
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
Atomic maximum equivalent to (*acc) = std::max(*acc, value)
Definition: atomic.hpp:156
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
Atomic compare and swap.
Definition: atomic.hpp:286
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
Atomic increment.
Definition: atomic.hpp:168
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
Atomic minimum equivalent to (*acc) = std::min(*acc, value)
Definition: atomic.hpp:143
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Base types used in common for RAJA reducer objects.
Header file providing RAJA reduction declarations.
RAJA header file defining atomic operations for CUDA.
Header file containing utility methods used in CUDA operations.
Definition: reduce.hpp:115
Definition: reduce.hpp:95
Definition: reduce.hpp:91
Header file for RAJA type definitions.
Header file providing RAJA sort templates.