RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
multi_reduce.hpp
Go to the documentation of this file.
1 
14 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
15 // Copyright (c) Lawrence Livermore National Security, LLC and other
16 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
17 // files for dates and other details. No copyright assignment is required
18 // to contribute to RAJA.
19 //
20 // SPDX-License-Identifier: (BSD-3-Clause)
21 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
22 
23 #ifndef RAJA_hip_multi_reduce_HPP
24 #define RAJA_hip_multi_reduce_HPP
25 
26 #include "RAJA/config.hpp"
27 
28 #if defined(RAJA_ENABLE_HIP)
29 
30 #include <type_traits>
31 #include <limits>
32 #include <mutex>
33 #include <utility>
34 #include <vector>
35 
36 
37 #include "hip/hip_runtime.h"
38 
39 #include "RAJA/util/macros.hpp"
40 #include "RAJA/util/math.hpp"
41 #include "RAJA/util/types.hpp"
42 #include "RAJA/util/reduce.hpp"
44 
47 
50 
51 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
53 #else
55 #endif
56 
59 
60 #include "RAJA/pattern/thread.hpp"
61 
62 namespace RAJA
63 {
64 
65 namespace hip
66 {
67 
68 namespace impl
69 {
70 
71 
72 //
74 //
75 // MultiReduction algorithms.
76 //
78 //
79 
81 template<typename Combiner,
82  typename GetTallyIndex,
83  typename T,
84  typename GetTallyOffset>
85 RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(
86  int RAJA_UNUSED_ARG(num_bins),
87  T identity,
88  int bin,
89  T value,
90  T* tally_mem,
91  GetTallyOffset get_tally_offset,
92  int tally_replication,
93  int tally_bins)
94 {
95  if (value == identity)
96  {
97  return;
98  }
99 
100  int tally_index =
101  GetTallyIndex::template index<int>(); // globalWarpId by default
102  int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication);
103  int tally_offset =
104  get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
105  RAJA::reduce::hip::atomic<Combiner> {}(tally_mem[tally_offset], value);
106 }
107 
109 template<typename T>
110 RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(
111  int num_bins,
112  T identity,
113  T* shared_mem,
114  int shared_replication)
115 {
116  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
117  (blockDim.x * blockDim.y) * threadIdx.z;
118  int numThreads = blockDim.x * blockDim.y * blockDim.z;
119 
120  for (int shmem_offset = threadId;
121  shmem_offset < shared_replication * num_bins; shmem_offset += numThreads)
122  {
123  shared_mem[shmem_offset] = identity;
124  }
125  __syncthreads();
126 }
127 
129 template<typename Combiner,
130  typename GetSharedIndex,
131  typename T,
132  typename GetSharedOffset>
133 RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(
134  int num_bins,
135  T identity,
136  int bin,
137  T value,
138  T* shared_mem,
139  GetSharedOffset get_shared_offset,
140  int shared_replication)
141 {
142  if (value == identity)
143  {
144  return;
145  }
146 
147  int shared_index =
148  GetSharedIndex::template index<int>(); // threadId by default
149  int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication);
150  int shmem_offset =
151  get_shared_offset(bin, num_bins, shared_rep, shared_replication);
152 
153  RAJA::reduce::hip::atomic<Combiner> {}(shared_mem[shmem_offset], value);
154 }
155 
157 template<typename Combiner,
158  typename T,
159  typename GetSharedOffset,
160  typename GetTallyOffset>
161 RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(
162  int num_bins,
163  T identity,
164  T* shared_mem,
165  GetSharedOffset get_shared_offset,
166  int shared_replication,
167  T* tally_mem,
168  GetTallyOffset get_tally_offset,
169  int tally_replication,
170  int tally_bins)
171 {
172  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
173  (blockDim.x * blockDim.y) * threadIdx.z;
174  int numThreads = blockDim.x * blockDim.y * blockDim.z;
175 
176  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
177  (gridDim.x * gridDim.y) * blockIdx.z;
178 
179  __syncthreads();
180  for (int bin = threadId; bin < num_bins; bin += numThreads)
181  {
182 
183  T value = identity;
184  for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep)
185  {
186  int shmem_offset =
187  get_shared_offset(bin, num_bins, shared_rep, shared_replication);
188  Combiner {}(value, shared_mem[shmem_offset]);
189  }
190 
191  if (value != identity)
192  {
193  int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication);
194  int tally_offset =
195  get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
196  RAJA::reduce::hip::atomic<Combiner> {}(tally_mem[tally_offset], value);
197  }
198  }
199 }
200 
201 } // namespace impl
202 
203 //
205 //
206 // MultiReduction classes.
207 //
209 //
210 
212 template<typename Combiner,
213  typename T,
214  typename tuning,
215  typename ThreadPolicy = RAJA::detail::active_auto_thread>
216 struct MultiReduceGridAtomicHostInit_TallyData
217 {
219  template<typename Container>
220  MultiReduceGridAtomicHostInit_TallyData(Container const& container,
221  T const& identity)
222  : m_tally_mem(nullptr),
223  m_identity(identity),
224  m_num_bins(container.size()),
225  m_tally_bins(get_tally_bins(m_num_bins)),
226  m_tally_replication(get_tally_replication())
227  {
228  m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
229  m_tally_replication);
230  }
231 
232  MultiReduceGridAtomicHostInit_TallyData() = delete;
233  MultiReduceGridAtomicHostInit_TallyData(
234  MultiReduceGridAtomicHostInit_TallyData const&) = default;
235  MultiReduceGridAtomicHostInit_TallyData(
236  MultiReduceGridAtomicHostInit_TallyData&&) = delete;
237  MultiReduceGridAtomicHostInit_TallyData& operator=(
238  MultiReduceGridAtomicHostInit_TallyData const&) = default;
239  MultiReduceGridAtomicHostInit_TallyData& operator=(
240  MultiReduceGridAtomicHostInit_TallyData&&) = delete;
241  ~MultiReduceGridAtomicHostInit_TallyData() = default;
242 
244  template<typename Container>
245  void reset_permanent(Container const& container, T const& identity)
246  {
247  int new_num_bins = container.size();
248  if (new_num_bins != m_num_bins)
249  {
250  teardown_permanent();
251  m_num_bins = new_num_bins;
252  m_tally_bins = get_tally_bins(m_num_bins);
253  m_tally_replication = get_tally_replication();
254  m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
255  m_tally_replication);
256  }
257  else
258  {
259  {
260  int tally_rep = 0;
261  int bin = 0;
262  for (auto const& value : container)
263  {
264  m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
265  m_tally_replication)] = value;
266  ++bin;
267  }
268  }
269  for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep)
270  {
271  for (int bin = 0; bin < m_num_bins; ++bin)
272  {
273  m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
274  m_tally_replication)] = identity;
275  }
276  }
277  }
278  m_identity = identity;
279  }
280 
282  void teardown_permanent()
283  {
284  destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication);
285  }
286 
288  T get(int bin) const
289  {
291  m_identity);
292  for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
293  {
294  int tally_offset =
295  GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
296  reducer.combine(m_tally_mem[tally_offset]);
297  }
298  return reducer.get_and_reset();
299  }
300 
301  int num_bins() const { return m_num_bins; }
302 
303  T identity() const { return m_identity; }
304 
305 private:
306  static constexpr size_t s_tally_alignment = std::max(
307  size_t(
308  policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
309  size_t(RAJA::DATA_ALIGN));
310  static constexpr size_t s_tally_bunch_size =
311  RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
312 
313  using tally_mempool_type = device_pinned_mempool_type;
314  using tally_tuning = typename tuning::GlobalAtomicReplicationTuning;
315  using TallyAtomicReplicationConcretizer =
316  typename tally_tuning::AtomicReplicationConcretizer;
317  using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator;
318  using GetTallyOffset_rebind =
319  typename GetTallyOffset_rebind_rebunch::template rebunch<
320  s_tally_bunch_size>;
321 
322  static int get_tally_bins(int num_bins)
323  {
324  return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) *
325  s_tally_bunch_size;
326  }
327 
328  static int get_tally_replication()
329  {
330  int min_tally_replication = RAJA::get_max_threads<ThreadPolicy>();
331 
332  struct
333  {
334  int func_min_global_replication;
335  } func_data {min_tally_replication};
336 
337  return TallyAtomicReplicationConcretizer {}
338  .template get_global_replication<int>(func_data);
339  }
340 
341  template<typename Container>
342  static T* create_tally(Container const& container,
343  T const& identity,
344  int num_bins,
345  int tally_bins,
346  int tally_replication)
347  {
348  if (num_bins == size_t(0))
349  {
350  return nullptr;
351  }
352 
353  T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
354  tally_replication * tally_bins, s_tally_alignment);
355 
356  if (tally_replication > 0)
357  {
358  {
359  int tally_rep = 0;
360  int bin = 0;
361  for (auto const& value : container)
362  {
363  int tally_offset =
364  GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
365  new (&tally_mem[tally_offset]) T(value);
366  ++bin;
367  }
368  }
369  for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep)
370  {
371  for (int bin = 0; bin < num_bins; ++bin)
372  {
373  int tally_offset =
374  GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
375  new (&tally_mem[tally_offset]) T(identity);
376  }
377  }
378  }
379  return tally_mem;
380  }
381 
382  static void destroy_tally(T*& tally_mem,
383  int num_bins,
384  int tally_bins,
385  int tally_replication)
386  {
387  if (num_bins == size_t(0))
388  {
389  return;
390  }
391 
392  for (int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep)
393  {
394  for (int bin = num_bins; bin > 0; --bin)
395  {
396  int tally_offset = GetTallyOffset {}(bin - 1, tally_bins, tally_rep - 1,
397  tally_replication);
398  tally_mem[tally_offset].~T();
399  }
400  }
401  tally_mempool_type::getInstance().free(tally_mem);
402  tally_mem = nullptr;
403  }
404 
405 protected:
406  using GetTallyIndex = typename tally_tuning::ReplicationIndexer;
407  using GetTallyOffset = typename GetTallyOffset_rebind::template rebind<int>;
408 
409  T* m_tally_mem;
410  T m_identity;
411  int m_num_bins;
412  int m_tally_bins;
413  int m_tally_replication; // power of 2, at least the max number of omp
414  // threads
415 };
416 
418 template<typename Combiner,
419  typename T,
420  typename tuning,
421  typename ThreadPolicy = RAJA::detail::active_auto_thread>
422 struct MultiReduceGridAtomicHostInit_Data
423  : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
424 {
425  using TallyData =
426  MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
427 
429  using TallyData::get;
430  using TallyData::identity;
431  using TallyData::num_bins;
432  using TallyData::reset_permanent;
433  using TallyData::TallyData;
434  using TallyData::teardown_permanent;
435 
437  void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) {}
438 
440  void teardown_launch() {}
441 
444  void setup_device() {}
445 
448  void finalize_device() {}
449 
452  void combine_device(int bin, T value)
453  {
454  impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
455  m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
456  m_tally_replication, m_tally_bins);
457  }
458 
460  void combine_host(int bin, T value)
461  {
462  int tally_rep = RAJA::get_thread_num<ThreadPolicy>();
463  int tally_offset =
464  GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
465  Combiner {}(m_tally_mem[tally_offset], value);
466  }
467 
468 private:
469  using typename TallyData::GetTallyIndex;
470  using typename TallyData::GetTallyOffset;
471 
472  using TallyData::m_identity;
473  using TallyData::m_num_bins;
474  using TallyData::m_tally_bins;
475  using TallyData::m_tally_mem;
476  using TallyData::m_tally_replication;
477 };
478 
480 template<typename Combiner,
481  typename T,
482  typename tuning,
483  typename ThreadPolicy = RAJA::detail::active_auto_thread>
484 struct MultiReduceBlockThenGridAtomicHostInit_Data
485  : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
486 {
487  using TallyData =
488  MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
489 
491  template<typename Container>
492  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container,
493  T const& identity)
494  : TallyData(container, identity),
495  m_shared_offset(s_shared_offset_unknown),
496  m_shared_replication(0)
497  {}
498 
499  MultiReduceBlockThenGridAtomicHostInit_Data() = delete;
500  MultiReduceBlockThenGridAtomicHostInit_Data(
501  MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
502  MultiReduceBlockThenGridAtomicHostInit_Data(
503  MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
504  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
505  MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
506  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
507  MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
508  ~MultiReduceBlockThenGridAtomicHostInit_Data() = default;
509 
510 
512  using TallyData::get;
513  using TallyData::identity;
514  using TallyData::num_bins;
515  using TallyData::reset_permanent;
516  using TallyData::teardown_permanent;
517 
519  void setup_launch(size_t block_size)
520  {
521  if (m_num_bins == size_t(0))
522  {
523  m_shared_offset = s_shared_offset_invalid;
524  return;
525  }
526 
527  size_t shared_replication = 0;
528  const size_t shared_offset =
529  allocateDynamicShmem<T>([&](size_t max_shmem_size) {
530  struct
531  {
532  size_t func_threads_per_block;
533  size_t func_max_shared_replication_per_block;
534  } func_data {block_size, max_shmem_size / m_num_bins};
535 
536  shared_replication =
537  SharedAtomicReplicationConcretizer {}
538  .template get_shared_replication<size_t>(func_data);
539  return m_num_bins * shared_replication;
540  });
541 
542  if (shared_offset != dynamic_smem_allocation_failure)
543  {
544  m_shared_replication = static_cast<int>(shared_replication);
545  m_shared_offset = static_cast<int>(shared_offset);
546  }
547  else
548  {
549  m_shared_offset = s_shared_offset_invalid;
550  }
551  }
552 
554  void teardown_launch()
555  {
556  m_shared_replication = 0;
557  m_shared_offset = s_shared_offset_unknown;
558  }
559 
562  void setup_device()
563  {
564  T* shared_mem = get_shared_mem();
565  if (shared_mem != nullptr)
566  {
567  impl::block_multi_reduce_init_shmem(m_num_bins, m_identity, shared_mem,
568  m_shared_replication);
569  }
570  }
571 
574  void finalize_device()
575  {
576  T* shared_mem = get_shared_mem();
577  if (shared_mem != nullptr)
578  {
579  impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
580  m_num_bins, m_identity, shared_mem, GetSharedOffset {},
581  m_shared_replication, m_tally_mem, GetTallyOffset {},
582  m_tally_replication, m_tally_bins);
583  }
584  }
585 
588  void combine_device(int bin, T value)
589  {
590  T* shared_mem = get_shared_mem();
591  if (shared_mem != nullptr)
592  {
593  impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
594  m_num_bins, m_identity, bin, value, shared_mem, GetSharedOffset {},
595  m_shared_replication);
596  }
597  else
598  {
599  impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
600  m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
601  m_tally_replication, m_tally_bins);
602  }
603  }
604 
606  void combine_host(int bin, T value)
607  {
608  int tally_rep = RAJA::get_thread_num<ThreadPolicy>();
609  int tally_offset =
610  GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
611  Combiner {}(m_tally_mem[tally_offset], value);
612  }
613 
614 private:
615  using shared_tuning = typename tuning::SharedAtomicReplicationTuning;
616  using SharedAtomicReplicationConcretizer =
617  typename shared_tuning::AtomicReplicationConcretizer;
618  using GetSharedIndex = typename shared_tuning::ReplicationIndexer;
619  using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator;
620  using GetSharedOffset = typename GetSharedOffset_rebind::template rebind<int>;
621 
622  using typename TallyData::GetTallyIndex;
623  using typename TallyData::GetTallyOffset;
624 
625 
626  static constexpr int s_shared_offset_unknown =
628  static constexpr int s_shared_offset_invalid =
630 
631 
632  using TallyData::m_identity;
633  using TallyData::m_num_bins;
634  using TallyData::m_tally_bins;
635  using TallyData::m_tally_mem;
636  using TallyData::m_tally_replication;
637 
638  int m_shared_offset; // in bytes
639  int m_shared_replication; // power of 2
640 
642  T* get_shared_mem() const
643  {
644  if (m_shared_offset == s_shared_offset_invalid)
645  {
646  return nullptr;
647  }
648  extern __shared__ char shared_mem[];
649  return reinterpret_cast<T*>(&shared_mem[m_shared_offset]);
650  }
651 };
652 
671 template<typename T, typename t_MultiReduceOp, typename tuning>
672 struct MultiReduceDataHip
673 {
674  static constexpr bool atomic_available =
675  RAJA::reduce::hip::hip_atomic_available<T>::value;
676 
678  using reduce_data_type = std::conditional_t<
679  (atomic_available),
680  std::conditional_t<
681  (tuning::algorithm ==
683  init_host_combine_block_atomic_then_grid_atomic),
684  hip::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
685  T,
686  tuning>,
687  std::conditional_t<
688  (tuning::algorithm ==
689  multi_reduce_algorithm::init_host_combine_global_atomic),
690  hip::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
691  T,
692  tuning>,
693  void>>,
694  void>;
695 
696 
697  using SyncList = std::vector<resources::Hip>;
698 
699 public:
700  using value_type = T;
701  using MultiReduceOp = t_MultiReduceOp;
702 
703  MultiReduceDataHip() = delete;
704 
705  template<typename Container,
706  std::enable_if_t<
707  !std::is_same<Container, MultiReduceDataHip>::value>* = nullptr>
708  MultiReduceDataHip(Container const& container, T identity)
709  : m_parent(this),
710  m_sync_list(new SyncList),
711  m_data(container, identity),
712  m_own_launch_data(false)
713  {}
714 
716  // init val_ptr to avoid uninitialized read caused by host copy of
717  // reducer in host device lambda not being used on device.
719  MultiReduceDataHip(MultiReduceDataHip const& other)
720 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
721  : m_parent(other.m_parent)
722 #else
723  : m_parent(&other)
724 #endif
725  ,
726  m_sync_list(other.m_sync_list),
727  m_data(other.m_data),
728  m_own_launch_data(false)
729  {
730 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
731  if (m_parent)
732  {
733  if (setupReducers())
734  {
735  // the copy made in make_launch_body does this setup
736  add_resource_to_synchronization_list(currentResource());
737  m_data.setup_launch(currentBlockSize());
738  m_own_launch_data = true;
739  m_parent = nullptr;
740  }
741  }
742 #else
743  if (!m_parent->m_parent)
744  {
745  // the first copy on device enters this branch
746  m_data.setup_device();
747  }
748 #endif
749  }
750 
751  MultiReduceDataHip(MultiReduceDataHip&&) = delete;
752  MultiReduceDataHip& operator=(MultiReduceDataHip const&) = delete;
753  MultiReduceDataHip& operator=(MultiReduceDataHip&&) = delete;
754 
756  // on device store in pinned buffer on host
758  ~MultiReduceDataHip()
759  {
760 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
761  if (m_parent == this)
762  {
763  // the original object, owns permanent storage
764  synchronize_resources_and_clear_list();
765  delete m_sync_list;
766  m_sync_list = nullptr;
767  m_data.teardown_permanent();
768  }
769  else if (m_parent)
770  {
771  // do nothing
772  }
773  else
774  {
775  if (m_own_launch_data)
776  {
777  // the copy made in make_launch_body, owns launch data
778  m_data.teardown_launch();
779  m_own_launch_data = false;
780  }
781  }
782 #else
783  if (!m_parent->m_parent)
784  {
785  // the first copy on device, does finalization on the device
786  m_data.finalize_device();
787  }
788 #endif
789  }
790 
791  template<typename Container>
792  void reset(Container const& container, T identity)
793  {
794  synchronize_resources_and_clear_list();
795  m_data.reset_permanent(container, identity);
796  }
797 
800  void combine(int bin, T const& value)
801  {
802 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
803  m_data.combine_host(bin, value);
804 #else
805  m_data.combine_device(bin, value);
806 #endif
807  }
808 
810  T get(int bin)
811  {
812  synchronize_resources_and_clear_list();
813  return m_data.get(bin);
814  }
815 
816  size_t num_bins() const { return m_data.num_bins(); }
817 
818  T identity() const { return m_data.identity(); }
819 
820 
821 private:
822  MultiReduceDataHip const* m_parent;
823  SyncList* m_sync_list;
824  reduce_data_type m_data;
825  bool m_own_launch_data;
826 
827  void add_resource_to_synchronization_list(resources::Hip res)
828  {
829  for (resources::Hip& list_res : *m_sync_list)
830  {
831  if (list_res.get_stream() == res.get_stream())
832  {
833  return;
834  }
835  }
836  m_sync_list->emplace_back(res);
837  }
838 
839  void synchronize_resources_and_clear_list()
840  {
841  for (resources::Hip& list_res : *m_sync_list)
842  {
843  ::RAJA::hip::synchronize(list_res);
844  }
845  m_sync_list->clear();
846  }
847 };
848 
849 } // end namespace hip
850 
851 RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy,
852  hip::MultiReduceDataHip)
853 
854 } // namespace RAJA
855 
856 #endif // closing endif for RAJA_ENABLE_HIP guard
857 
858 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for HIP reductions and other opera...
RAJA header file defining Simple Offset Calculators.
Header file containing RAJA intrinsics templates for HIP execution.
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
Header file providing RAJA math templates.
multi_reduce_algorithm
Definition: policy.hpp:51
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
std::conditional_t< RAJA::operators::is_fp_associative< T >::value, BinaryTreeReduce< T, BinaryOp >, LeftFoldReduce< T, BinaryOp > > HighAccuracyReduce
Definition: reduce.hpp:357
RAJA_HOST_DEVICE constexpr RAJA_INLINE auto power_of_2_mod(L lhs, R rhs) noexcept
compute lhs mod rhs where lhs is non-negative and rhs is a power of 2
Definition: math.hpp:102
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Base types used in common for RAJA reducer objects.
#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)
Definition: multi_reduce.hpp:49
Header file providing RAJA reduction declarations.
RAJA header file defining thread operations.
RAJA header file defining atomic operations for HIP.
Header file containing utility methods used in HIP operations.
Definition: policy.hpp:130
Header file for RAJA type definitions.
Header file providing RAJA sort templates.