RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
reduce.hpp
Go to the documentation of this file.
1 
14 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
15 // Copyright (c) Lawrence Livermore National Security, LLC and other
16 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
17 // files for dates and other details. No copyright assignment is required
18 // to contribute to RAJA.
19 //
20 // SPDX-License-Identifier: (BSD-3-Clause)
21 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
22 
23 #ifndef RAJA_hip_reduce_HPP
24 #define RAJA_hip_reduce_HPP
25 
26 #include "RAJA/config.hpp"
27 
28 #if defined(RAJA_ENABLE_HIP)
29 
30 #include <type_traits>
31 #include <mutex>
32 
33 #include <hip/hip_runtime.h>
34 
35 #include "RAJA/util/macros.hpp"
36 #include "RAJA/util/SoAArray.hpp"
37 #include "RAJA/util/SoAPtr.hpp"
39 #include "RAJA/util/types.hpp"
40 #include "RAJA/util/reduce.hpp"
41 
43 #include "RAJA/pattern/reduce.hpp"
44 
50 
51 namespace RAJA
52 {
53 
54 namespace reduce
55 {
56 
57 namespace hip
58 {
59 
61 template<typename Combiner>
62 struct atomic;
63 
64 template<typename T>
65 struct atomic<sum<T>>
66 {
67  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
68  {
69  RAJA::atomicAdd(RAJA::hip_atomic {}, &val, v);
70  }
71 };
72 
73 template<typename T>
74 struct atomic<min<T>>
75 {
76  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
77  {
78  RAJA::atomicMin(RAJA::hip_atomic {}, &val, v);
79  }
80 };
81 
82 template<typename T>
83 struct atomic<max<T>>
84 {
85  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
86  {
87  RAJA::atomicMax(RAJA::hip_atomic {}, &val, v);
88  }
89 };
90 
91 template<typename T>
92 struct atomic<and_bit<T>>
93 {
94  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
95  {
96  RAJA::atomicAnd(RAJA::hip_atomic {}, &val, v);
97  }
98 };
99 
100 template<typename T>
101 struct atomic<or_bit<T>>
102 {
103  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
104  {
105  RAJA::atomicOr(RAJA::hip_atomic {}, &val, v);
106  }
107 };
108 
109 template<typename T>
110 struct hip_atomic_available
111 {
112  static constexpr const bool value =
113  (std::is_integral<T>::value && (4 == sizeof(T) || 8 == sizeof(T))) ||
114  std::is_same<T, float>::value || std::is_same<T, double>::value;
115 };
116 
117 } // namespace hip
118 
119 } // namespace reduce
120 
121 namespace hip
122 {
123 
124 namespace impl
125 {
126 
128 // returns true if put reduced value in val
129 template<typename Combiner,
130  typename Accessor,
131  int replication,
132  int atomic_stride,
133  typename T,
134  typename TempIterator>
135 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
136  T identity,
137  TempIterator in_device_mem,
138  unsigned int* device_count)
139 {
140  typename TempIterator::template rebind_accessor<Accessor> device_mem(
141  in_device_mem);
142 
143  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
144  (blockDim.x * blockDim.y) * threadIdx.z;
145  int numThreads = blockDim.x * blockDim.y * blockDim.z;
146 
147  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
148  (gridDim.x * gridDim.y) * blockIdx.z;
149  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
150 
151  int replicationId = blockId % replication;
152  int slotId = blockId / replication;
153 
154  int maxNumSlots = (numBlocks + replication - 1) / replication;
155  unsigned int numSlots = (numBlocks / replication) +
156  ((replicationId < (numBlocks % replication)) ? 1 : 0);
157 
158  int atomicOffset = replicationId * atomic_stride;
159  int beginSlots = replicationId * maxNumSlots;
160  int blockSlot = beginSlots + slotId;
161 
162  T temp = block_reduce<Combiner>(val, identity);
163 
164  if (numSlots <= 1u)
165  {
166  if (threadId == 0)
167  {
168  val = temp;
169  }
170  return (threadId == 0) ? replicationId : replication;
171  }
172 
173  // one thread per block writes to device_mem
174  __shared__ bool isLastBlock;
175  if (threadId == 0)
176  {
177  device_mem.set(blockSlot, temp);
178  // ensure write visible to all threadblocks
179  Accessor::fence_release();
180  // increment counter, (wraps back to zero if old count == (numSlots-1))
181  unsigned int old_count =
182  ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
183  isLastBlock = (old_count == (numSlots - 1));
184  }
185 
186  // returns non-zero value if any thread passes in a non-zero value
187  __syncthreads();
188 
189  // last block accumulates values from device_mem
190  if (isLastBlock)
191  {
192  temp = identity;
193  Accessor::fence_acquire();
194 
195  for (unsigned int i = threadId; i < numSlots; i += numThreads)
196  {
197  Combiner {}(temp, device_mem.get(beginSlots + i));
198  }
199 
200  temp = block_reduce<Combiner>(temp, identity);
201 
202  // one thread returns value
203  if (threadId == 0)
204  {
205  val = temp;
206  }
207  }
208 
209  return (isLastBlock && threadId == 0) ? replicationId : replication;
210 }
211 
212 namespace expt
213 {
214 
215 template<typename ThreadIterationGetter, typename Combiner, typename T>
216 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
217 {
218  const int numThreads = ThreadIterationGetter::size();
219  const int threadId = ThreadIterationGetter::index();
220 
221  const int warpId = threadId % RAJA::policy::hip::device_constants.WARP_SIZE;
222  const int warpNum = threadId / RAJA::policy::hip::device_constants.WARP_SIZE;
223 
224  T temp = val;
225 
226  if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0)
227  {
228 
229  // reduce each warp
230  for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
231  {
232  T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
233  temp = Combiner {}(temp, rhs);
234  }
235  }
236  else
237  {
238 
239  // reduce each warp
240  for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
241  {
242  int srcLane = threadId ^ i;
243  T rhs = RAJA::hip::impl::shfl_sync(temp, srcLane);
244  // only add from threads that exist (don't double count own value)
245  if (srcLane < numThreads)
246  {
247  temp = Combiner {}(temp, rhs);
248  }
249  }
250  }
251 
252  static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <=
253  RAJA::policy::hip::device_constants.WARP_SIZE,
254  "Max Warps must be less than or equal to Warp Size for this "
255  "algorithm to work");
256 
257  // reduce per warp values
258  if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE)
259  {
260 
261  // Need to separate declaration and initialization for clang-hip
262  __shared__ unsigned char tmpsd[sizeof(
264  RAJA::policy::hip::device_constants.MAX_WARPS>)];
265 
266  // Partial placement new: Should call new(tmpsd) here but recasting memory
267  // to avoid calling constructor/destructor in shared memory.
268  RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS>*
269  sd = reinterpret_cast<RAJA::detail::SoAArray<
270  T, RAJA::policy::hip::device_constants.MAX_WARPS>*>(tmpsd);
271 
272  // write per warp values to shared memory
273  if (warpId == 0)
274  {
275  sd->set(warpNum, temp);
276  }
277 
278  __syncthreads();
279 
280  if (warpNum == 0)
281  {
282 
283  // read per warp values
284  if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads)
285  {
286  temp = sd->get(warpId);
287  }
288  else
289  {
290  temp = identity;
291  }
292 
293  for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2)
294  {
295  T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
296  temp = Combiner {}(temp, rhs);
297  }
298  }
299 
300  __syncthreads();
301  }
302 
303  return temp;
304 }
305 
306 template<typename GlobalIterationGetter, typename OP, typename T>
307 RAJA_DEVICE RAJA_INLINE void grid_reduce(
308  T* device_target,
309  T val,
311  unsigned int* device_count)
312 {
313  using BlockIterationGetter =
314  typename get_index_block<GlobalIterationGetter>::type;
315  using ThreadIterationGetter =
316  typename get_index_thread<GlobalIterationGetter>::type;
317 
318  const int numBlocks = BlockIterationGetter::size();
319  const int numThreads = ThreadIterationGetter::size();
320  const unsigned int wrap_around = numBlocks - 1;
321 
322  const int blockId = BlockIterationGetter::index();
323  const int threadId = ThreadIterationGetter::index();
324 
325  T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
326 
327  // one thread per block writes to device_mem
328  bool lastBlock = false;
329  if (threadId == 0)
330  {
331  device_mem.set(blockId, temp);
332  // ensure write visible to all threadblocks
333  __threadfence();
334  // increment counter, (wraps back to zero if old count == wrap_around)
335  unsigned int old_count = ::atomicInc(device_count, wrap_around);
336  lastBlock = (old_count == wrap_around);
337  }
338 
339  // returns non-zero value if any thread passes in a non-zero value
340  lastBlock = __syncthreads_or(lastBlock);
341 
342  // last block accumulates values from device_mem
343  if (lastBlock)
344  {
345  temp = OP::identity();
346  __threadfence();
347 
348  for (int i = threadId; i < numBlocks; i += numThreads)
349  {
350  temp = OP {}(temp, device_mem.get(i));
351  }
352 
353  temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
354 
355  // one thread returns value
356  if (threadId == 0)
357  {
358  *device_target = temp;
359  }
360  }
361 }
362 
363 } // namespace expt
364 
366 // returns true if put reduced value in val
367 template<typename Combiner,
368  typename Accessor,
369  int replication,
370  int atomic_stride,
371  typename T>
372 RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(
373  T& val,
374  T identity,
375  T* device_mem,
376  unsigned int* device_count)
377 {
378  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
379  (blockDim.x * blockDim.y) * threadIdx.z;
380 
381  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
382  (gridDim.x * gridDim.y) * blockIdx.z;
383  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
384 
385  int replicationId = (blockId % replication);
386  int atomicOffset = replicationId * atomic_stride;
387 
388  unsigned int numSlots = (numBlocks / replication) +
389  ((replicationId < (numBlocks % replication)) ? 1 : 0);
390 
391  if (numSlots <= 1u)
392  {
393  T temp = block_reduce<Combiner>(val, identity);
394  if (threadId == 0)
395  {
396  val = temp;
397  }
398  return (threadId == 0) ? replicationId : replication;
399  }
400 
401  // the first block of each replication initializes device_mem
402  if (threadId == 0)
403  {
404  unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u);
405  if (old_val == 0u)
406  {
407  Accessor::set(device_mem, atomicOffset, identity);
408  Accessor::fence_release();
409  ::atomicAdd(&device_count[atomicOffset], 1u);
410  }
411  }
412 
413  T temp = block_reduce<Combiner>(val, identity);
414 
415  // one thread per block performs an atomic on device_mem
416  bool isLastBlock = false;
417  if (threadId == 0)
418  {
419  // wait for device_mem to be initialized
420  while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u)
421  ;
422  Accessor::fence_acquire();
423  RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
424  Accessor::fence_release();
425  // increment counter, (wraps back to zero if old count == (numSlots+1))
426  unsigned int old_count =
427  ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
428  isLastBlock = (old_count == (numSlots + 1));
429 
430  // the last block for each replication gets the value from device_mem
431  if (isLastBlock)
432  {
433  Accessor::fence_acquire();
434  val = Accessor::get(device_mem, atomicOffset);
435  }
436  }
437 
438  return isLastBlock ? replicationId : replication;
439 }
440 
442 template<typename Combiner, int replication, int atomic_stride, typename T>
443 RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
444  T identity,
445  T* device_mem)
446 {
447  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
448  (blockDim.x * blockDim.y) * threadIdx.z;
449 
450  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
451  (gridDim.x * gridDim.y) * blockIdx.z;
452 
453  int replicationId = (blockId % replication);
454  int atomicOffset = replicationId * atomic_stride;
455 
456  T temp = block_reduce<Combiner>(val, identity);
457 
458  // one thread per block performs an atomic on device_mem
459  if (threadId == 0 && temp != identity)
460  {
461  RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
462  }
463 }
464 
465 } // namespace impl
466 
468 // use one per reducer object
469 template<typename T, size_t num_slots, typename mempool>
470 class PinnedTally
471 {
472 public:
474  struct Node
475  {
476  Node* next;
477  T values[num_slots];
478  };
479 
481  struct ResourceNode
482  {
483  ResourceNode* next;
484  ::RAJA::resources::Hip res;
485  Node* node_list;
486  };
487 
489  class ResourceIterator
490  {
491  public:
492  ResourceIterator() = delete;
493 
494  ResourceIterator(ResourceNode* rn) : m_rn(rn) {}
495 
496  const ResourceIterator& operator++()
497  {
498  m_rn = m_rn->next;
499  return *this;
500  }
501 
502  ResourceIterator operator++(int)
503  {
504  ResourceIterator ret = *this;
505  this->operator++();
506  return ret;
507  }
508 
509  ::RAJA::resources::Hip& operator*() { return m_rn->res; }
510 
511  bool operator==(const ResourceIterator& rhs) const
512  {
513  return m_rn == rhs.m_rn;
514  }
515 
516  bool operator!=(const ResourceIterator& rhs) const
517  {
518  return !this->operator==(rhs);
519  }
520 
521  private:
522  ResourceNode* m_rn;
523  };
524 
526  class ResourceNodeIterator
527  {
528  public:
529  ResourceNodeIterator() = delete;
530 
531  ResourceNodeIterator(ResourceNode* rn, Node* n) : m_rn(rn), m_n(n) {}
532 
533  const ResourceNodeIterator& operator++()
534  {
535  if (m_n->next)
536  {
537  m_n = m_n->next;
538  }
539  else if (m_rn->next)
540  {
541  m_rn = m_rn->next;
542  m_n = m_rn->node_list;
543  }
544  else
545  {
546  m_rn = nullptr;
547  m_n = nullptr;
548  }
549  return *this;
550  }
551 
552  ResourceNodeIterator operator++(int)
553  {
554  ResourceNodeIterator ret = *this;
555  this->operator++();
556  return ret;
557  }
558 
559  auto operator*() -> T (&)[num_slots] { return m_n->values; }
560 
561  bool operator==(const ResourceNodeIterator& rhs) const
562  {
563  return m_n == rhs.m_n;
564  }
565 
566  bool operator!=(const ResourceNodeIterator& rhs) const
567  {
568  return !this->operator==(rhs);
569  }
570 
571  private:
572  ResourceNode* m_rn;
573  Node* m_n;
574  };
575 
576  PinnedTally() : resource_list(nullptr) {}
577 
578  PinnedTally(const PinnedTally&) = delete;
579 
581  ResourceIterator resourceBegin() { return {resource_list}; }
582 
584  ResourceIterator resourceEnd() { return {nullptr}; }
585 
587  ResourceNodeIterator begin()
588  {
589  return {resource_list, resource_list ? resource_list->node_list : nullptr};
590  }
591 
593  ResourceNodeIterator end() { return {nullptr, nullptr}; }
594 
596  auto new_value(::RAJA::resources::Hip res) -> T (&)[num_slots]
597  {
598  std::lock_guard<std::mutex> lock(m_mutex);
599  ResourceNode* rn = resource_list;
600  while (rn)
601  {
602  if (rn->res.get_stream() == res.get_stream()) break;
603  rn = rn->next;
604  }
605  if (!rn)
606  {
607  rn = (ResourceNode*)malloc(sizeof(ResourceNode));
608  rn->next = resource_list;
609  rn->res = res;
610  rn->node_list = nullptr;
611  resource_list = rn;
612  }
613  Node* n = mempool::getInstance().template malloc<Node>(1);
614  n->next = rn->node_list;
615  rn->node_list = n;
616  return n->values;
617  }
618 
620  void synchronize_resources()
621  {
622  auto end = resourceEnd();
623  for (auto r = resourceBegin(); r != end; ++r)
624  {
626  }
627  }
628 
630  void free_list()
631  {
632  while (resource_list)
633  {
634  ResourceNode* rn = resource_list;
635  while (rn->node_list)
636  {
637  Node* n = rn->node_list;
638  rn->node_list = n->next;
639  mempool::getInstance().free(n);
640  }
641  resource_list = rn->next;
642  free(rn);
643  }
644  }
645 
646  ~PinnedTally() { free_list(); }
647 
648  std::mutex m_mutex;
649 
650 private:
651  ResourceNode* resource_list;
652 };
653 
654 //
656 //
657 // Reduction classes.
658 //
660 //
661 
664 template<typename Combiner,
665  typename Accessor,
666  typename T,
667  size_t replication,
668  size_t atomic_stride>
669 struct ReduceLastBlock_Data
670 {
671  using tally_mempool_type = pinned_mempool_type;
672  using data_mempool_type = device_mempool_type;
673  using count_mempool_type = device_zeroed_mempool_type;
674 
675  static constexpr size_t tally_slots = replication;
676 
677  mutable T value;
678  T identity;
679  unsigned int* device_count;
681  bool own_device_ptr;
682 
683  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {};
684 
690  ReduceLastBlock_Data(T initValue, T identity_)
691  : value {initValue},
692  identity {identity_},
693  device_count {nullptr},
694  device {},
695  own_device_ptr {false}
696  {}
697 
699  ReduceLastBlock_Data(const ReduceLastBlock_Data& other)
700  : value {other.identity},
701  identity {other.identity},
702  device_count {other.device_count},
703  device {other.device},
704  own_device_ptr {false}
705  {}
706 
707  ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default;
708 
710  // uninitialized memory
711  T* init_grid_vals(T (&output)[tally_slots])
712  {
713  for (size_t r = 0; r < tally_slots; ++r)
714  {
715  output[r] = identity;
716  }
717  return &output[0];
718  }
719 
722  void grid_reduce(T* output)
723  {
724  T temp = value;
725  size_t replicationId =
726  impl::grid_reduce_last_block<Combiner, Accessor, replication,
727  atomic_stride>(temp, identity, device,
728  device_count);
729  if (replicationId != replication)
730  {
731  output[replicationId] = temp;
732  }
733  }
734 
736  // allocate device pointers and get a new result buffer from the pinned tally
737  bool setupForDevice()
738  {
739  bool act = !device.allocated() && setupReducers();
740  if (act)
741  {
742  hip_dim_t gridDim = currentGridDim();
743  size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
744  size_t maxNumSlots = (numBlocks + replication - 1) / replication;
745  device.allocate(maxNumSlots * replication);
746  device_count =
747  count_mempool_type::getInstance().template malloc<unsigned int>(
748  replication * atomic_stride);
749  own_device_ptr = true;
750  }
751  return act;
752  }
753 
755  // free device pointers
756  bool teardownForDevice()
757  {
758  bool act = own_device_ptr;
759  if (act)
760  {
761  device.deallocate();
762  count_mempool_type::getInstance().free(device_count);
763  device_count = nullptr;
764  own_device_ptr = false;
765  }
766  return act;
767  }
768 };
769 
771 template<typename Combiner,
772  typename T,
773  size_t replication,
774  size_t atomic_stride>
775 struct ReduceAtomicHostInit_Data
776 {
777  using tally_mempool_type = device_pinned_mempool_type;
778 
779  static constexpr size_t tally_slots = replication * atomic_stride;
780 
781  mutable T value;
782  T identity;
783  bool is_setup;
784  bool own_device_ptr;
785 
786  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {}
787 
788  ReduceAtomicHostInit_Data(T initValue, T identity_)
789  : value {initValue},
790  identity {identity_},
791  is_setup {false},
792  own_device_ptr {false}
793  {}
794 
796  ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other)
797  : value {other.identity},
798  identity {other.identity},
799  is_setup {other.is_setup},
800  own_device_ptr {false}
801  {}
802 
803  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) =
804  default;
805 
807  // uninitialized memory
808  T* init_grid_vals(T (&output)[tally_slots])
809  {
810  for (size_t r = 0; r < tally_slots; ++r)
811  {
812  output[r] = identity;
813  }
814  return &output[0];
815  }
816 
819  void grid_reduce(T* output)
820  {
821  T temp = value;
822 
823  impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
824  temp, identity, output);
825  }
826 
828  // allocate device pointers and get a new result buffer from the pinned tally
829  bool setupForDevice()
830  {
831  bool act = !is_setup && setupReducers();
832  if (act)
833  {
834  is_setup = true;
835  own_device_ptr = true;
836  }
837  return act;
838  }
839 
841  // free device pointers
842  bool teardownForDevice()
843  {
844  bool act = own_device_ptr;
845  if (act)
846  {
847  is_setup = false;
848  own_device_ptr = false;
849  }
850  return act;
851  }
852 };
853 
855 template<typename Combiner,
856  typename Accessor,
857  typename T,
858  size_t replication,
859  size_t atomic_stride>
860 struct ReduceAtomicDeviceInit_Data
861 {
862  using tally_mempool_type = pinned_mempool_type;
863  using data_mempool_type = device_mempool_type;
864  using count_mempool_type = device_zeroed_mempool_type;
865 
866  static constexpr size_t tally_slots = replication;
867 
868  mutable T value;
869  T identity;
870  unsigned int* device_count;
871  T* device;
872  bool own_device_ptr;
873 
874  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {}
875 
876  ReduceAtomicDeviceInit_Data(T initValue, T identity_)
877  : value {initValue},
878  identity {identity_},
879  device_count {nullptr},
880  device {nullptr},
881  own_device_ptr {false}
882  {}
883 
885  ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other)
886  : value {other.identity},
887  identity {other.identity},
888  device_count {other.device_count},
889  device {other.device},
890  own_device_ptr {false}
891  {}
892 
893  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) =
894  default;
895 
897  // uninitialized memory
898  T* init_grid_vals(T (&output)[tally_slots])
899  {
900  for (size_t r = 0; r < tally_slots; ++r)
901  {
902  output[r] = identity;
903  }
904  return &output[0];
905  }
906 
909  void grid_reduce(T* output)
910  {
911  T temp = value;
912 
913  size_t replicationId =
914  impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
915  atomic_stride>(
916  temp, identity, device, device_count);
917  if (replicationId != replication)
918  {
919  output[replicationId] = temp;
920  }
921  }
922 
924  // allocate device pointers and get a new result buffer from the pinned tally
925  bool setupForDevice()
926  {
927  bool act = !device && setupReducers();
928  if (act)
929  {
930  device = data_mempool_type::getInstance().template malloc<T>(
931  replication * atomic_stride);
932  device_count =
933  count_mempool_type::getInstance().template malloc<unsigned int>(
934  replication * atomic_stride);
935  own_device_ptr = true;
936  }
937  return act;
938  }
939 
941  // free device pointers
942  bool teardownForDevice()
943  {
944  bool act = own_device_ptr;
945  if (act)
946  {
947  data_mempool_type::getInstance().free(device);
948  device = nullptr;
949  count_mempool_type::getInstance().free(device_count);
950  device_count = nullptr;
951  own_device_ptr = false;
952  }
953  return act;
954  }
955 };
956 
958 template<typename Combiner, typename T, typename tuning>
959 class Reduce
960 {
961  static constexpr size_t replication =
962  (tuning::replication > 0) ? tuning::replication : 32;
963  static constexpr size_t atomic_stride =
964  (tuning::atomic_stride > 0)
965  ? tuning::atomic_stride
966  : ((policy::hip::device_constants
967  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
969  policy::hip::device_constants
970  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
971  sizeof(T))
972  : 1);
973 
974  using Accessor = std::conditional_t<
975  (tuning::comm_mode == block_communication_mode::block_fence),
976  impl::AccessorDeviceScopeUseBlockFence,
977  std::conditional_t<(tuning::comm_mode ==
978  block_communication_mode::device_fence),
979  impl::AccessorDeviceScopeUseDeviceFence,
980  void>>;
981 
982  static constexpr bool atomic_policy =
983  (tuning::algorithm ==
984  reduce_algorithm::init_device_combine_atomic_block) ||
985  (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
986  static constexpr bool atomic_available =
987  RAJA::reduce::hip::hip_atomic_available<T>::value;
988 
990  using reduce_data_type = std::conditional_t<
991  (tuning::algorithm == reduce_algorithm::combine_last_block) ||
992  (atomic_policy && !atomic_available),
993  hip::ReduceLastBlock_Data<Combiner,
994  Accessor,
995  T,
996  replication,
997  atomic_stride>,
998  std::conditional_t<
999  atomic_available,
1000  std::conditional_t<
1001  (tuning::algorithm ==
1002  reduce_algorithm::init_device_combine_atomic_block),
1003  hip::ReduceAtomicDeviceInit_Data<Combiner,
1004  Accessor,
1005  T,
1006  replication,
1007  atomic_stride>,
1008  std::conditional_t<
1009  (tuning::algorithm ==
1010  reduce_algorithm::init_host_combine_atomic_block),
1011  hip::ReduceAtomicHostInit_Data<Combiner,
1012  T,
1013  replication,
1014  atomic_stride>,
1015  void>>,
1016  void>>;
1017 
1018  static constexpr size_t tally_slots = reduce_data_type::tally_slots;
1019 
1020  using TallyType = PinnedTally<T,
1021  tally_slots,
1022  typename reduce_data_type::tally_mempool_type>;
1023 
1025  // only use list before setup for device and only use val_ptr after
1026  union tally_u
1027  {
1028  TallyType* list;
1029  T* val_ptr;
1030  constexpr tally_u(TallyType* l) : list(l) {};
1031  constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
1032  };
1033 
1034 public:
1035  Reduce() : Reduce(T(), Combiner::identity()) {}
1036 
1038  // the original object's parent is itself
1039  explicit Reduce(T init_val, T identity_ = Combiner::identity())
1040  : parent {this},
1041  tally_or_val_ptr {new TallyType},
1042  val(init_val, identity_)
1043  {}
1044 
1045  void reset(T in_val, T identity_ = Combiner::identity())
1046  {
1047  operator T(); // syncs device
1048  val = reduce_data_type(in_val, identity_);
1049  }
1050 
1052  // init val_ptr to avoid uninitialized read caused by host copy of
1053  // reducer in host device lambda not being used on device.
1055  Reduce(const Reduce& other)
1056 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1057  : parent {other.parent},
1058 #else
1059  : parent {&other},
1060 #endif
1061  tally_or_val_ptr {other.tally_or_val_ptr},
1062  val(other.val)
1063  {
1064 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1065  if (parent)
1066  {
1067  if (val.setupForDevice())
1068  {
1069  tally_or_val_ptr.val_ptr = val.init_grid_vals(
1070  tally_or_val_ptr.list->new_value(currentResource()));
1071  parent = nullptr;
1072  }
1073  }
1074 #endif
1075  }
1076 
1078  // on device store in pinned buffer on host
1080  ~Reduce()
1081  {
1082 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1083  if (parent == this)
1084  {
1085  delete tally_or_val_ptr.list;
1086  tally_or_val_ptr.list = nullptr;
1087  }
1088  else if (parent)
1089  {
1090  if (val.value != val.identity)
1091  {
1092  std::lock_guard<std::mutex> lock(tally_or_val_ptr.list->m_mutex);
1093  parent->combine(val.value);
1094  }
1095  }
1096  else
1097  {
1098  if (val.teardownForDevice())
1099  {
1100  tally_or_val_ptr.val_ptr = nullptr;
1101  }
1102  }
1103 #else
1104  if (!parent->parent)
1105  {
1106  val.grid_reduce(tally_or_val_ptr.val_ptr);
1107  }
1108  else
1109  {
1110  parent->combine(val.value);
1111  }
1112 #endif
1113  }
1114 
1116  operator T()
1117  {
1118  auto n = tally_or_val_ptr.list->begin();
1119  auto end = tally_or_val_ptr.list->end();
1120  if (n != end)
1121  {
1122  tally_or_val_ptr.list->synchronize_resources();
1124  std::move(val.value));
1125  for (; n != end; ++n)
1126  {
1127  T(&values)[tally_slots] = *n;
1128  for (size_t r = 0; r < tally_slots; ++r)
1129  {
1130  reducer.combine(std::move(values[r]));
1131  }
1132  }
1133  val.value = reducer.get_and_reset();
1134  tally_or_val_ptr.list->free_list();
1135  }
1136  return val.value;
1137  }
1138 
1140  T get() { return operator T(); }
1141 
1144  void combine(T other) const { Combiner {}(val.value, other); }
1145 
1149  T& local() const { return val.value; }
1150 
1151  T get_combined() const { return val.value; }
1152 
1153 private:
1154  const Reduce* parent;
1155  tally_u tally_or_val_ptr;
1156  reduce_data_type val;
1157 };
1158 
1159 } // end namespace hip
1160 
1162 template<typename tuning, typename T>
1163 class ReduceSum<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1164  : public hip::Reduce<RAJA::reduce::sum<T>, T, tuning>
1165 {
1166 
1167 public:
1168  using Base = hip::Reduce<RAJA::reduce::sum<T>, T, tuning>;
1169  using Base::Base;
1170 
1173  const ReduceSum& operator+=(T rhs) const
1174  {
1175  this->combine(rhs);
1176  return *this;
1177  }
1178 };
1179 
1181 template<typename tuning, typename T>
1182 class ReduceBitOr<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1183  : public hip::Reduce<RAJA::reduce::or_bit<T>, T, tuning>
1184 {
1185 
1186 public:
1187  using Base = hip::Reduce<RAJA::reduce::or_bit<T>, T, tuning>;
1188  using Base::Base;
1189 
1192  const ReduceBitOr& operator|=(T rhs) const
1193  {
1194  this->combine(rhs);
1195  return *this;
1196  }
1197 };
1198 
1200 template<typename tuning, typename T>
1201 class ReduceBitAnd<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1202  : public hip::Reduce<RAJA::reduce::and_bit<T>, T, tuning>
1203 {
1204 
1205 public:
1206  using Base = hip::Reduce<RAJA::reduce::and_bit<T>, T, tuning>;
1207  using Base::Base;
1208 
1211  const ReduceBitAnd& operator&=(T rhs) const
1212  {
1213  this->combine(rhs);
1214  return *this;
1215  }
1216 };
1217 
1219 template<typename tuning, typename T>
1220 class ReduceMin<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1221  : public hip::Reduce<RAJA::reduce::min<T>, T, tuning>
1222 {
1223 
1224 public:
1225  using Base = hip::Reduce<RAJA::reduce::min<T>, T, tuning>;
1226  using Base::Base;
1227 
1230  const ReduceMin& min(T rhs) const
1231  {
1232  this->combine(rhs);
1233  return *this;
1234  }
1235 };
1236 
1238 template<typename tuning, typename T>
1239 class ReduceMax<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
1240  : public hip::Reduce<RAJA::reduce::max<T>, T, tuning>
1241 {
1242 
1243 public:
1244  using Base = hip::Reduce<RAJA::reduce::max<T>, T, tuning>;
1245  using Base::Base;
1246 
1249  const ReduceMax& max(T rhs) const
1250  {
1251  this->combine(rhs);
1252  return *this;
1253  }
1254 };
1255 
1257 template<typename tuning, typename T, typename IndexType>
1258 class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
1259  : public hip::Reduce<
1260  RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
1261  RAJA::reduce::detail::ValueLoc<T, IndexType>,
1262  tuning>
1263 {
1264 
1265 public:
1267  using Combiner = RAJA::reduce::min<value_type>;
1268  using NonLocCombiner = RAJA::reduce::min<T>;
1269  using Base = hip::Reduce<Combiner, value_type, tuning>;
1270  using Base::Base;
1271 
1273  ReduceMinLoc(T init_val,
1274  IndexType init_idx,
1275  T identity_val = NonLocCombiner::identity(),
1276  IndexType identity_idx =
1278  : Base(value_type(init_val, init_idx),
1279  value_type(identity_val, identity_idx))
1280  {}
1281 
1283  // this must be here to hide Base::reset
1284  void reset(T init_val,
1285  IndexType init_idx,
1286  T identity_val = NonLocCombiner::identity(),
1287  IndexType identity_idx =
1289  {
1290  Base::reset(value_type(init_val, init_idx),
1291  value_type(identity_val, identity_idx));
1292  }
1293 
1296  const ReduceMinLoc& minloc(T rhs, IndexType loc) const
1297  {
1298  this->combine(value_type(rhs, loc));
1299  return *this;
1300  }
1301 
1303  IndexType getLoc() { return Base::get().getLoc(); }
1304 
1306  operator T() { return Base::get(); }
1307 
1309  T get() { return Base::get(); }
1310 };
1311 
1313 template<typename tuning, typename T, typename IndexType>
1314 class ReduceMaxLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
1315  : public hip::Reduce<
1316  RAJA::reduce::max<
1317  RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
1318  RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
1319  tuning>
1320 {
1321 public:
1323  using Combiner = RAJA::reduce::max<value_type>;
1324  using NonLocCombiner = RAJA::reduce::max<T>;
1325  using Base = hip::Reduce<Combiner, value_type, tuning>;
1326  using Base::Base;
1327 
1329  ReduceMaxLoc(T init_val,
1330  IndexType init_idx,
1331  T identity_val = NonLocCombiner::identity(),
1332  IndexType identity_idx =
1334  : Base(value_type(init_val, init_idx),
1335  value_type(identity_val, identity_idx))
1336  {}
1337 
1339  // this must be here to hide Base::reset
1340  void reset(T init_val,
1341  IndexType init_idx,
1342  T identity_val = NonLocCombiner::identity(),
1343  IndexType identity_idx =
1345  {
1346  Base::reset(value_type(init_val, init_idx),
1347  value_type(identity_val, identity_idx));
1348  }
1349 
1352  const ReduceMaxLoc& maxloc(T rhs, IndexType loc) const
1353  {
1354  this->combine(value_type(rhs, loc));
1355  return *this;
1356  }
1357 
1359  IndexType getLoc() { return Base::get().getLoc(); }
1360 
1362  operator T() { return Base::get(); }
1363 
1365  T get() { return Base::get(); }
1366 };
1367 
1368 } // namespace RAJA
1369 
1370 #endif // closing endif for RAJA_ENABLE_HIP guard
1371 
1372 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for HIP reductions and other opera...
Header file for common RAJA internal definitions.
Header file for common RAJA internal definitions.
RAJA header file containing an implementation of a memory pool.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Pointer class specialized for Struct of Array data layout allocated via RAJA basic_mempools.
Definition: SoAPtr.hpp:52
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAPtr.hpp:100
constexpr RAJA_HOST_DEVICE value_type get(size_t i) const
Definition: SoAPtr.hpp:95
Definition: reduce.hpp:131
Header file containing RAJA intrinsics templates for HIP execution.
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
constexpr auto Reduce(T *target)
Definition: reducer.hpp:231
RAJA_INLINE RAJA_HOST_DEVICE auto operator*(LHS const &left_operand, RHS const &right_operand) -> TensorMultiply< typename NormalizeOperandHelper< LHS >::return_type, RHS >
Definition: TensorMultiply.hpp:155
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
Atomic bitwise AND equivalent to (*acc) = (*acc) & value This only works with integral data types.
Definition: atomic.hpp:224
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
Atomic bitwise OR equivalent to (*acc) = (*acc) | value This only works with integral data types.
Definition: atomic.hpp:240
RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it)
returns iterator to next item
Definition: algorithm.hpp:90
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
std::conditional_t< RAJA::operators::is_fp_associative< T >::value, BinaryTreeReduce< T, BinaryOp >, LeftFoldReduce< T, BinaryOp > > HighAccuracyReduce
Definition: reduce.hpp:357
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
Atomic maximum equivalent to (*acc) = std::max(*acc, value)
Definition: atomic.hpp:156
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
Atomic compare and swap.
Definition: atomic.hpp:286
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
Atomic increment.
Definition: atomic.hpp:168
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
Atomic minimum equivalent to (*acc) = std::min(*acc, value)
Definition: atomic.hpp:143
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Base types used in common for RAJA reducer objects.
Header file providing RAJA reduction declarations.
RAJA header file defining atomic operations for HIP.
Header file containing utility methods used in HIP operations.
Definition: reduce.hpp:115
Definition: reduce.hpp:95
Definition: reduce.hpp:91
Header file for RAJA type definitions.
Header file providing RAJA sort templates.