23 #ifndef RAJA_cuda_multi_reduce_HPP
24 #define RAJA_cuda_multi_reduce_HPP
26 #include "RAJA/config.hpp"
28 #if defined(RAJA_ENABLE_CUDA)
30 #include <type_traits>
51 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
81 template<
typename Combiner,
82 typename GetTallyIndex,
84 typename GetTallyOffset>
85 RAJA_DEVICE RAJA_INLINE
void block_multi_reduce_combine_global_atomic(
91 GetTallyOffset get_tally_offset,
92 int tally_replication,
95 if (value == identity)
101 GetTallyIndex::template index<int>();
104 get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
105 RAJA::reduce::cuda::atomic<Combiner> {}(tally_mem[tally_offset], value);
110 RAJA_DEVICE RAJA_INLINE
void block_multi_reduce_init_shmem(
114 int shared_replication)
116 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
117 (blockDim.x * blockDim.y) * threadIdx.z;
118 int numThreads = blockDim.x * blockDim.y * blockDim.z;
120 for (
int shmem_offset = threadId;
121 shmem_offset < shared_replication * num_bins; shmem_offset += numThreads)
123 shared_mem[shmem_offset] = identity;
129 template<
typename Combiner,
130 typename GetSharedIndex,
132 typename GetSharedOffset>
133 RAJA_DEVICE RAJA_INLINE
void block_multi_reduce_combine_shmem_atomic(
139 GetSharedOffset get_shared_offset,
140 int shared_replication)
142 if (value == identity)
148 GetSharedIndex::template index<int>();
151 get_shared_offset(bin, num_bins, shared_rep, shared_replication);
153 RAJA::reduce::cuda::atomic<Combiner> {}(shared_mem[shmem_offset], value);
157 template<
typename Combiner,
159 typename GetSharedOffset,
160 typename GetTallyOffset>
161 RAJA_DEVICE RAJA_INLINE
void grid_multi_reduce_shmem_to_global_atomic(
165 GetSharedOffset get_shared_offset,
166 int shared_replication,
168 GetTallyOffset get_tally_offset,
169 int tally_replication,
172 int threadId = threadIdx.x + blockDim.x * threadIdx.y +
173 (blockDim.x * blockDim.y) * threadIdx.z;
174 int numThreads = blockDim.x * blockDim.y * blockDim.z;
176 int blockId = blockIdx.x + gridDim.x * blockIdx.y +
177 (gridDim.x * gridDim.y) * blockIdx.z;
180 for (
int bin = threadId; bin < num_bins; bin += numThreads)
184 for (
int shared_rep = 0; shared_rep < shared_replication; ++shared_rep)
187 get_shared_offset(bin, num_bins, shared_rep, shared_replication);
188 Combiner {}(value, shared_mem[shmem_offset]);
191 if (value != identity)
195 get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
196 RAJA::reduce::cuda::atomic<Combiner> {}(tally_mem[tally_offset], value);
212 template<
typename Combiner,
216 struct MultiReduceGridAtomicHostInit_TallyData
219 template<
typename Container>
220 MultiReduceGridAtomicHostInit_TallyData(Container
const& container,
222 : m_tally_mem(nullptr),
223 m_identity(identity),
224 m_num_bins(container.size()),
225 m_tally_bins(get_tally_bins(m_num_bins)),
226 m_tally_replication(get_tally_replication())
228 m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
229 m_tally_replication);
232 MultiReduceGridAtomicHostInit_TallyData() =
delete;
233 MultiReduceGridAtomicHostInit_TallyData(
234 MultiReduceGridAtomicHostInit_TallyData
const&) =
default;
235 MultiReduceGridAtomicHostInit_TallyData(
236 MultiReduceGridAtomicHostInit_TallyData&&) =
delete;
237 MultiReduceGridAtomicHostInit_TallyData& operator=(
238 MultiReduceGridAtomicHostInit_TallyData
const&) =
default;
239 MultiReduceGridAtomicHostInit_TallyData& operator=(
240 MultiReduceGridAtomicHostInit_TallyData&&) =
delete;
241 ~MultiReduceGridAtomicHostInit_TallyData() =
default;
244 template<
typename Container>
245 void reset_permanent(Container
const& container, T
const& identity)
247 int new_num_bins = container.size();
248 if (new_num_bins != m_num_bins)
250 teardown_permanent();
251 m_num_bins = new_num_bins;
252 m_tally_bins = get_tally_bins(m_num_bins);
253 m_tally_replication = get_tally_replication();
254 m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
255 m_tally_replication);
262 for (
auto const& value : container)
264 m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
265 m_tally_replication)] = value;
269 for (
int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep)
271 for (
int bin = 0; bin < m_num_bins; ++bin)
273 m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
274 m_tally_replication)] = identity;
278 m_identity = identity;
282 void teardown_permanent()
284 destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication);
292 for (
int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
295 GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
296 reducer.combine(m_tally_mem[tally_offset]);
298 return reducer.get_and_reset();
301 int num_bins()
const {
return m_num_bins; }
303 T identity()
const {
return m_identity; }
306 static constexpr
size_t s_tally_alignment =
std::max(
308 policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
309 size_t(RAJA::DATA_ALIGN));
310 static constexpr
size_t s_tally_bunch_size =
313 using tally_mempool_type = device_pinned_mempool_type;
314 using tally_tuning =
typename tuning::GlobalAtomicReplicationTuning;
315 using TallyAtomicReplicationConcretizer =
316 typename tally_tuning::AtomicReplicationConcretizer;
317 using GetTallyOffset_rebind_rebunch =
typename tally_tuning::OffsetCalculator;
318 using GetTallyOffset_rebind =
319 typename GetTallyOffset_rebind_rebunch::template rebunch<
322 static int get_tally_bins(
int num_bins)
328 static int get_tally_replication()
330 int min_tally_replication = RAJA::get_max_threads<ThreadPolicy>();
334 int func_min_global_replication;
335 } func_data {min_tally_replication};
337 return TallyAtomicReplicationConcretizer {}
338 .template get_global_replication<int>(func_data);
341 template<
typename Container>
342 static T* create_tally(Container
const& container,
346 int tally_replication)
348 if (num_bins ==
size_t(0))
353 T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
354 tally_replication * tally_bins, s_tally_alignment);
356 if (tally_replication > 0)
361 for (
auto const& value : container)
364 GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
365 new (&tally_mem[tally_offset]) T(value);
369 for (
int tally_rep = 1; tally_rep < tally_replication; ++tally_rep)
371 for (
int bin = 0; bin < num_bins; ++bin)
374 GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
375 new (&tally_mem[tally_offset]) T(identity);
382 static void destroy_tally(T*& tally_mem,
385 int tally_replication)
387 if (num_bins ==
size_t(0))
392 for (
int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep)
394 for (
int bin = num_bins; bin > 0; --bin)
396 int tally_offset = GetTallyOffset {}(bin - 1, tally_bins, tally_rep - 1,
398 tally_mem[tally_offset].~T();
401 tally_mempool_type::getInstance().free(tally_mem);
406 using GetTallyIndex =
typename tally_tuning::ReplicationIndexer;
407 using GetTallyOffset =
typename GetTallyOffset_rebind::template rebind<int>;
413 int m_tally_replication;
418 template<
typename Combiner,
422 struct MultiReduceGridAtomicHostInit_Data
423 : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
426 MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
430 using TallyData::identity;
431 using TallyData::num_bins;
432 using TallyData::reset_permanent;
433 using TallyData::TallyData;
434 using TallyData::teardown_permanent;
440 void teardown_launch() {}
444 void setup_device() {}
448 void finalize_device() {}
452 void combine_device(
int bin, T value)
454 impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
455 m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
456 m_tally_replication, m_tally_bins);
460 void combine_host(
int bin, T value)
462 int tally_rep = RAJA::get_thread_num<ThreadPolicy>();
464 GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
465 Combiner {}(m_tally_mem[tally_offset], value);
469 using typename TallyData::GetTallyIndex;
470 using typename TallyData::GetTallyOffset;
472 using TallyData::m_identity;
473 using TallyData::m_num_bins;
474 using TallyData::m_tally_bins;
475 using TallyData::m_tally_mem;
476 using TallyData::m_tally_replication;
480 template<
typename Combiner,
484 struct MultiReduceBlockThenGridAtomicHostInit_Data
485 : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
488 MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
491 template<
typename Container>
492 MultiReduceBlockThenGridAtomicHostInit_Data(Container
const& container,
494 : TallyData(container, identity),
495 m_shared_offset(s_shared_offset_unknown),
496 m_shared_replication(0)
499 MultiReduceBlockThenGridAtomicHostInit_Data() =
delete;
500 MultiReduceBlockThenGridAtomicHostInit_Data(
501 MultiReduceBlockThenGridAtomicHostInit_Data
const&) =
default;
502 MultiReduceBlockThenGridAtomicHostInit_Data(
503 MultiReduceBlockThenGridAtomicHostInit_Data&&) =
delete;
504 MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
505 MultiReduceBlockThenGridAtomicHostInit_Data
const&) =
default;
506 MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
507 MultiReduceBlockThenGridAtomicHostInit_Data&&) =
delete;
508 ~MultiReduceBlockThenGridAtomicHostInit_Data() =
default;
513 using TallyData::identity;
514 using TallyData::num_bins;
515 using TallyData::reset_permanent;
516 using TallyData::teardown_permanent;
519 void setup_launch(
size_t block_size)
521 if (m_num_bins ==
size_t(0))
523 m_shared_offset = s_shared_offset_invalid;
527 size_t shared_replication = 0;
528 const size_t shared_offset =
529 allocateDynamicShmem<T>([&](
size_t max_shmem_size) {
532 size_t func_threads_per_block;
533 size_t func_max_shared_replication_per_block;
534 } func_data {block_size, max_shmem_size / m_num_bins};
537 SharedAtomicReplicationConcretizer {}
538 .template get_shared_replication<size_t>(func_data);
539 return m_num_bins * shared_replication;
542 if (shared_offset != dynamic_smem_allocation_failure)
544 m_shared_replication =
static_cast<int>(shared_replication);
545 m_shared_offset =
static_cast<int>(shared_offset);
549 m_shared_offset = s_shared_offset_invalid;
554 void teardown_launch()
556 m_shared_replication = 0;
557 m_shared_offset = s_shared_offset_unknown;
564 T* shared_mem = get_shared_mem();
565 if (shared_mem !=
nullptr)
567 impl::block_multi_reduce_init_shmem(m_num_bins, m_identity, shared_mem,
568 m_shared_replication);
574 void finalize_device()
576 T* shared_mem = get_shared_mem();
577 if (shared_mem !=
nullptr)
579 impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
580 m_num_bins, m_identity, shared_mem, GetSharedOffset {},
581 m_shared_replication, m_tally_mem, GetTallyOffset {},
582 m_tally_replication, m_tally_bins);
588 void combine_device(
int bin, T value)
590 T* shared_mem = get_shared_mem();
591 if (shared_mem !=
nullptr)
593 impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
594 m_num_bins, m_identity, bin, value, shared_mem, GetSharedOffset {},
595 m_shared_replication);
599 impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
600 m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
601 m_tally_replication, m_tally_bins);
606 void combine_host(
int bin, T value)
608 int tally_rep = RAJA::get_thread_num<ThreadPolicy>();
610 GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
611 Combiner {}(m_tally_mem[tally_offset], value);
615 using shared_tuning =
typename tuning::SharedAtomicReplicationTuning;
616 using SharedAtomicReplicationConcretizer =
617 typename shared_tuning::AtomicReplicationConcretizer;
618 using GetSharedIndex =
typename shared_tuning::ReplicationIndexer;
619 using GetSharedOffset_rebind =
typename shared_tuning::OffsetCalculator;
620 using GetSharedOffset =
typename GetSharedOffset_rebind::template rebind<int>;
622 using typename TallyData::GetTallyIndex;
623 using typename TallyData::GetTallyOffset;
626 static constexpr
int s_shared_offset_unknown =
628 static constexpr
int s_shared_offset_invalid =
632 using TallyData::m_identity;
633 using TallyData::m_num_bins;
634 using TallyData::m_tally_bins;
635 using TallyData::m_tally_mem;
636 using TallyData::m_tally_replication;
639 int m_shared_replication;
642 T* get_shared_mem()
const
644 if (m_shared_offset == s_shared_offset_invalid)
648 extern __shared__
char shared_mem[];
649 return reinterpret_cast<T*
>(&shared_mem[m_shared_offset]);
671 template<
typename T,
typename t_MultiReduceOp,
typename tuning>
672 struct MultiReduceDataCuda
674 static constexpr
bool atomic_available =
675 RAJA::reduce::cuda::cuda_atomic_available<T>::value;
678 using reduce_data_type = std::conditional_t<
681 (tuning::algorithm ==
683 init_host_combine_block_atomic_then_grid_atomic),
684 cuda::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
688 (tuning::algorithm ==
689 multi_reduce_algorithm::init_host_combine_global_atomic),
690 cuda::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
697 using SyncList = std::vector<resources::Cuda>;
700 using value_type = T;
701 using MultiReduceOp = t_MultiReduceOp;
703 MultiReduceDataCuda() =
delete;
705 template<
typename Container,
707 !std::is_same<Container, MultiReduceDataCuda>::value>* =
nullptr>
708 MultiReduceDataCuda(Container
const& container, T identity)
710 m_sync_list(new SyncList),
711 m_data(container, identity),
712 m_own_launch_data(false)
719 MultiReduceDataCuda(MultiReduceDataCuda
const& other)
720 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
721 : m_parent(other.m_parent)
726 m_sync_list(other.m_sync_list),
727 m_data(other.m_data),
728 m_own_launch_data(
false)
730 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
736 add_resource_to_synchronization_list(currentResource());
737 m_data.setup_launch(currentBlockSize());
738 m_own_launch_data =
true;
743 if (!m_parent->m_parent)
746 m_data.setup_device();
751 MultiReduceDataCuda(MultiReduceDataCuda&&) =
delete;
752 MultiReduceDataCuda& operator=(MultiReduceDataCuda
const&) =
delete;
753 MultiReduceDataCuda& operator=(MultiReduceDataCuda&&) =
delete;
758 ~MultiReduceDataCuda()
760 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
761 if (m_parent ==
this)
764 synchronize_resources_and_clear_list();
766 m_sync_list =
nullptr;
767 m_data.teardown_permanent();
775 if (m_own_launch_data)
778 m_data.teardown_launch();
779 m_own_launch_data =
false;
783 if (!m_parent->m_parent)
786 m_data.finalize_device();
791 template<
typename Container>
792 void reset(Container
const& container, T identity)
794 synchronize_resources_and_clear_list();
795 m_data.reset_permanent(container, identity);
800 void combine(
int bin, T
const& value)
802 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
803 m_data.combine_host(bin, value);
805 m_data.combine_device(bin, value);
812 synchronize_resources_and_clear_list();
813 return m_data.get(bin);
816 size_t num_bins()
const {
return m_data.num_bins(); }
818 T identity()
const {
return m_data.identity(); }
822 MultiReduceDataCuda
const* m_parent;
823 SyncList* m_sync_list;
824 reduce_data_type m_data;
825 bool m_own_launch_data;
827 void add_resource_to_synchronization_list(resources::Cuda res)
829 for (resources::Cuda& list_res : *m_sync_list)
831 if (list_res.get_stream() == res.get_stream())
836 m_sync_list->emplace_back(res);
839 void synchronize_resources_and_clear_list()
841 for (resources::Cuda& list_res : *m_sync_list)
845 m_sync_list->clear();
852 cuda::MultiReduceDataCuda)
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
RAJA header file defining Simple Offset Calculators.
Header file containing RAJA intrinsics templates for CUDA execution.
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
Header file providing RAJA math templates.
multi_reduce_algorithm
Definition: policy.hpp:51
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
std::conditional_t< RAJA::operators::is_fp_associative< T >::value, BinaryTreeReduce< T, BinaryOp >, LeftFoldReduce< T, BinaryOp > > HighAccuracyReduce
Definition: reduce.hpp:357
RAJA_HOST_DEVICE constexpr RAJA_INLINE auto power_of_2_mod(L lhs, R rhs) noexcept
compute lhs mod rhs where lhs is non-negative and rhs is a power of 2
Definition: math.hpp:102
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Base types used in common for RAJA reducer objects.
#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)
Definition: multi_reduce.hpp:49
Header file providing RAJA reduction declarations.
RAJA header file defining thread operations.
RAJA header file defining atomic operations for CUDA.
Header file containing utility methods used in CUDA operations.
Definition: policy.hpp:130
Header file for RAJA type definitions.
Header file providing RAJA sort templates.