RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
reduce.hpp
Go to the documentation of this file.
1 
14 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
15 // Copyright (c) Lawrence Livermore National Security, LLC and other
16 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
17 // files for dates and other details. No copyright assignment is required
18 // to contribute to RAJA.
19 //
20 // SPDX-License-Identifier: (BSD-3-Clause)
21 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
22 
23 #ifndef RAJA_cuda_reduce_HPP
24 #define RAJA_cuda_reduce_HPP
25 
26 #include "RAJA/config.hpp"
27 
28 #if defined(RAJA_ENABLE_CUDA)
29 
30 #include <type_traits>
31 #include <mutex>
32 
33 #include <cuda.h>
34 
35 #include "RAJA/util/macros.hpp"
36 #include "RAJA/util/SoAArray.hpp"
37 #include "RAJA/util/SoAPtr.hpp"
39 #include "RAJA/util/types.hpp"
40 #include "RAJA/util/reduce.hpp"
41 
43 #include "RAJA/pattern/reduce.hpp"
44 
47 
48 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
50 #else
52 #endif
53 
56 
57 namespace RAJA
58 {
59 
60 namespace reduce
61 {
62 
63 namespace cuda
64 {
65 
67 template<typename Combiner>
68 struct atomic;
69 
70 template<typename T>
71 struct atomic<sum<T>>
72 {
73  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
74  {
75  RAJA::atomicAdd(RAJA::cuda_atomic {}, &val, v);
76  }
77 };
78 
79 template<typename T>
80 struct atomic<min<T>>
81 {
82  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
83  {
84  RAJA::atomicMin(RAJA::cuda_atomic {}, &val, v);
85  }
86 };
87 
88 template<typename T>
89 struct atomic<max<T>>
90 {
91  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
92  {
93  RAJA::atomicMax(RAJA::cuda_atomic {}, &val, v);
94  }
95 };
96 
97 template<typename T>
98 struct atomic<and_bit<T>>
99 {
100  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
101  {
102  RAJA::atomicAnd(RAJA::cuda_atomic {}, &val, v);
103  }
104 };
105 
106 template<typename T>
107 struct atomic<or_bit<T>>
108 {
109  RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
110  {
111  RAJA::atomicOr(RAJA::cuda_atomic {}, &val, v);
112  }
113 };
114 
115 template<typename T>
116 struct cuda_atomic_available
117 {
118  static constexpr const bool value =
119  (std::is_integral<T>::value && (4 == sizeof(T) || 8 == sizeof(T))) ||
120  std::is_same<T, float>::value || std::is_same<T, double>::value;
121 };
122 
123 } // namespace cuda
124 
125 } // namespace reduce
126 
127 namespace cuda
128 {
129 
130 namespace impl
131 {
132 
134 // returns true if put reduced value in val
135 template<typename Combiner,
136  typename Accessor,
137  int replication,
138  int atomic_stride,
139  typename T,
140  typename TempIterator>
141 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
142  T identity,
143  TempIterator in_device_mem,
144  unsigned int* device_count)
145 {
146  typename TempIterator::template rebind_accessor<Accessor> device_mem(
147  in_device_mem);
148 
149  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
150  (blockDim.x * blockDim.y) * threadIdx.z;
151  int numThreads = blockDim.x * blockDim.y * blockDim.z;
152 
153  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
154  (gridDim.x * gridDim.y) * blockIdx.z;
155  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
156 
157  int replicationId = blockId % replication;
158  int slotId = blockId / replication;
159 
160  int maxNumSlots = (numBlocks + replication - 1) / replication;
161  unsigned int numSlots = (numBlocks / replication) +
162  ((replicationId < (numBlocks % replication)) ? 1 : 0);
163 
164  int atomicOffset = replicationId * atomic_stride;
165  int beginSlots = replicationId * maxNumSlots;
166  int blockSlot = beginSlots + slotId;
167 
168  T temp = block_reduce<Combiner>(val, identity);
169 
170  if (numSlots <= 1u)
171  {
172  if (threadId == 0)
173  {
174  val = temp;
175  }
176  return (threadId == 0) ? replicationId : replication;
177  }
178 
179  // one thread per block writes to device_mem
180  bool isLastBlock = false;
181  if (threadId == 0)
182  {
183  device_mem.set(blockSlot, temp);
184  // ensure write visible to all threadblocks
185  Accessor::fence_release();
186  // increment counter, (wraps back to zero if old count == (numSlots-1))
187  unsigned int old_count =
188  ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
189  isLastBlock = (old_count == (numSlots - 1));
190  }
191 
192  // returns non-zero value if any thread passes in a non-zero value
193  isLastBlock = __syncthreads_or(isLastBlock);
194 
195  // last block accumulates values from device_mem
196  if (isLastBlock)
197  {
198  temp = identity;
199  Accessor::fence_acquire();
200 
201  for (unsigned int i = threadId; i < numSlots; i += numThreads)
202  {
203  Combiner {}(temp, device_mem.get(beginSlots + i));
204  }
205 
206  temp = block_reduce<Combiner>(temp, identity);
207 
208  // one thread returns value
209  if (threadId == 0)
210  {
211  val = temp;
212  }
213  }
214 
215  return (isLastBlock && threadId == 0) ? replicationId : replication;
216 }
217 
218 namespace expt
219 {
220 
221 template<typename ThreadIterationGetter, typename Combiner, typename T>
222 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
223 {
224  const int numThreads = ThreadIterationGetter::size();
225  const int threadId = ThreadIterationGetter::index();
226 
227  const int warpId = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE;
228  const int warpNum = threadId / RAJA::policy::cuda::device_constants.WARP_SIZE;
229 
230  T temp = val;
231 
232  if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0)
233  {
234 
235  // reduce each warp
236  for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
237  {
238  T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
239  temp = Combiner {}(temp, rhs);
240  }
241  }
242  else
243  {
244 
245  // reduce each warp
246  for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
247  {
248  int srcLane = threadId ^ i;
249  T rhs = RAJA::cuda::impl::shfl_sync(temp, srcLane);
250  // only add from threads that exist (don't double count own value)
251  if (srcLane < numThreads)
252  {
253  temp = Combiner {}(temp, rhs);
254  }
255  }
256  }
257 
258  static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <=
259  RAJA::policy::cuda::device_constants.WARP_SIZE,
260  "Max Warps must be less than or equal to Warp Size for this "
261  "algorithm to work");
262 
263  // reduce per warp values
264  if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE)
265  {
266 
267  // Need to separate declaration and initialization for clang-cuda
268  __shared__ unsigned char
269  tmpsd[sizeof(RAJA::detail::SoAArray<
270  T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
271 
272  // Partial placement new: Should call new(tmpsd) here but recasting memory
273  // to avoid calling constructor/destructor in shared memory.
274  RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>*
275  sd = reinterpret_cast<RAJA::detail::SoAArray<
276  T, RAJA::policy::cuda::device_constants.MAX_WARPS>*>(tmpsd);
277 
278  // write per warp values to shared memory
279  if (warpId == 0)
280  {
281  sd->set(warpNum, temp);
282  }
283 
284  __syncthreads();
285 
286  if (warpNum == 0)
287  {
288 
289  // read per warp values
290  if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads)
291  {
292  temp = sd->get(warpId);
293  }
294  else
295  {
296  temp = identity;
297  }
298 
299  for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS;
300  i *= 2)
301  {
302  T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
303  temp = Combiner {}(temp, rhs);
304  }
305  }
306 
307  __syncthreads();
308  }
309 
310  return temp;
311 }
312 
313 template<typename GlobalIterationGetter, typename OP, typename T>
314 RAJA_DEVICE RAJA_INLINE void grid_reduce(
315  T* device_target,
316  T val,
318  unsigned int* device_count)
319 {
320  using BlockIterationGetter =
321  typename get_index_block<GlobalIterationGetter>::type;
322  using ThreadIterationGetter =
323  typename get_index_thread<GlobalIterationGetter>::type;
324 
325  const int numBlocks = BlockIterationGetter::size();
326  const int numThreads = ThreadIterationGetter::size();
327  const unsigned int wrap_around = numBlocks - 1;
328 
329  const int blockId = BlockIterationGetter::index();
330  const int threadId = ThreadIterationGetter::index();
331 
332  T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
333 
334  // one thread per block writes to device_mem
335  bool lastBlock = false;
336  if (threadId == 0)
337  {
338  device_mem.set(blockId, temp);
339  // ensure write visible to all threadblocks
340  __threadfence();
341  // increment counter, (wraps back to zero if old count == wrap_around)
342  unsigned int old_count = ::atomicInc(device_count, wrap_around);
343  lastBlock = (old_count == wrap_around);
344  }
345 
346  // returns non-zero value if any thread passes in a non-zero value
347  lastBlock = __syncthreads_or(lastBlock);
348 
349  // last block accumulates values from device_mem
350  if (lastBlock)
351  {
352  temp = OP::identity();
353  __threadfence();
354 
355  for (int i = threadId; i < numBlocks; i += numThreads)
356  {
357  temp = OP {}(temp, device_mem.get(i));
358  }
359 
360  temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
361 
362  // one thread returns value
363  if (threadId == 0)
364  {
365  *device_target = temp;
366  }
367  }
368 }
369 
370 } // namespace expt
371 
373 // returns true if put reduced value in val
374 template<typename Combiner,
375  typename Accessor,
376  int replication,
377  int atomic_stride,
378  typename T>
379 RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(
380  T& val,
381  T identity,
382  T* device_mem,
383  unsigned int* device_count)
384 {
385  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
386  (blockDim.x * blockDim.y) * threadIdx.z;
387 
388  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
389  (gridDim.x * gridDim.y) * blockIdx.z;
390  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
391 
392  int replicationId = (blockId % replication);
393  int atomicOffset = replicationId * atomic_stride;
394 
395  unsigned int numSlots = (numBlocks / replication) +
396  ((replicationId < (numBlocks % replication)) ? 1 : 0);
397 
398  if (numSlots <= 1u)
399  {
400  T temp = block_reduce<Combiner>(val, identity);
401  if (threadId == 0)
402  {
403  val = temp;
404  }
405  return (threadId == 0) ? replicationId : replication;
406  }
407 
408  // the first block of each replication initializes device_mem
409  if (threadId == 0)
410  {
411  unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u);
412  if (old_val == 0u)
413  {
414  Accessor::set(device_mem, atomicOffset, identity);
415  Accessor::fence_release();
416  ::atomicAdd(&device_count[atomicOffset], 1u);
417  }
418  }
419 
420  T temp = block_reduce<Combiner>(val, identity);
421 
422  // one thread per block performs an atomic on device_mem
423  bool isLastBlock = false;
424  if (threadId == 0)
425  {
426  // wait for device_mem to be initialized
427  while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u)
428  ;
429  Accessor::fence_acquire();
430  RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
431  Accessor::fence_release();
432  // increment counter, (wraps back to zero if old count == (numSlots+1))
433  unsigned int old_count =
434  ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
435  isLastBlock = (old_count == (numSlots + 1));
436 
437  // the last block for each replication gets the value from device_mem
438  if (isLastBlock)
439  {
440  Accessor::fence_acquire();
441  val = Accessor::get(device_mem, atomicOffset);
442  }
443  }
444 
445  return isLastBlock ? replicationId : replication;
446 }
447 
449 template<typename Combiner, int replication, int atomic_stride, typename T>
450 RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
451  T identity,
452  T* device_mem)
453 {
454  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
455  (blockDim.x * blockDim.y) * threadIdx.z;
456 
457  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
458  (gridDim.x * gridDim.y) * blockIdx.z;
459 
460  int replicationId = (blockId % replication);
461  int atomicOffset = replicationId * atomic_stride;
462 
463  T temp = block_reduce<Combiner>(val, identity);
464 
465  // one thread per block performs an atomic on device_mem
466  if (threadId == 0 && temp != identity)
467  {
468  RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
469  }
470 }
471 
472 } // namespace impl
473 
475 // use one per reducer object
476 template<typename T, size_t num_slots, typename mempool>
477 class PinnedTally
478 {
479 public:
481  struct Node
482  {
483  Node* next;
484  T values[num_slots];
485  };
486 
488  struct ResourceNode
489  {
490  ResourceNode* next;
491  ::RAJA::resources::Cuda res;
492  Node* node_list;
493  };
494 
496  class ResourceIterator
497  {
498  public:
499  ResourceIterator() = delete;
500 
501  ResourceIterator(ResourceNode* rn) : m_rn(rn) {}
502 
503  const ResourceIterator& operator++()
504  {
505  m_rn = m_rn->next;
506  return *this;
507  }
508 
509  ResourceIterator operator++(int)
510  {
511  ResourceIterator ret = *this;
512  this->operator++();
513  return ret;
514  }
515 
516  ::RAJA::resources::Cuda& operator*() { return m_rn->res; }
517 
518  bool operator==(const ResourceIterator& rhs) const
519  {
520  return m_rn == rhs.m_rn;
521  }
522 
523  bool operator!=(const ResourceIterator& rhs) const
524  {
525  return !this->operator==(rhs);
526  }
527 
528  private:
529  ResourceNode* m_rn;
530  };
531 
533  class ResourceNodeIterator
534  {
535  public:
536  ResourceNodeIterator() = delete;
537 
538  ResourceNodeIterator(ResourceNode* rn, Node* n) : m_rn(rn), m_n(n) {}
539 
540  const ResourceNodeIterator& operator++()
541  {
542  if (m_n->next)
543  {
544  m_n = m_n->next;
545  }
546  else if (m_rn->next)
547  {
548  m_rn = m_rn->next;
549  m_n = m_rn->node_list;
550  }
551  else
552  {
553  m_rn = nullptr;
554  m_n = nullptr;
555  }
556  return *this;
557  }
558 
559  ResourceNodeIterator operator++(int)
560  {
561  ResourceNodeIterator ret = *this;
562  this->operator++();
563  return ret;
564  }
565 
566  auto operator*() -> T (&)[num_slots] { return m_n->values; }
567 
568  bool operator==(const ResourceNodeIterator& rhs) const
569  {
570  return m_n == rhs.m_n;
571  }
572 
573  bool operator!=(const ResourceNodeIterator& rhs) const
574  {
575  return !this->operator==(rhs);
576  }
577 
578  private:
579  ResourceNode* m_rn;
580  Node* m_n;
581  };
582 
583  PinnedTally() : resource_list(nullptr) {}
584 
585  PinnedTally(const PinnedTally&) = delete;
586 
588  ResourceIterator resourceBegin() { return {resource_list}; }
589 
591  ResourceIterator resourceEnd() { return {nullptr}; }
592 
594  ResourceNodeIterator begin()
595  {
596  return {resource_list, resource_list ? resource_list->node_list : nullptr};
597  }
598 
600  ResourceNodeIterator end() { return {nullptr, nullptr}; }
601 
603  auto new_value(::RAJA::resources::Cuda res) -> T (&)[num_slots]
604  {
605  std::lock_guard<std::mutex> lock(m_mutex);
606  ResourceNode* rn = resource_list;
607  while (rn)
608  {
609  if (rn->res.get_stream() == res.get_stream()) break;
610  rn = rn->next;
611  }
612  if (!rn)
613  {
614  rn = (ResourceNode*)malloc(sizeof(ResourceNode));
615  rn->next = resource_list;
616  rn->res = res;
617  rn->node_list = nullptr;
618  resource_list = rn;
619  }
620  Node* n = mempool::getInstance().template malloc<Node>(1);
621  n->next = rn->node_list;
622  rn->node_list = n;
623  return n->values;
624  }
625 
627  void synchronize_resources()
628  {
629  auto end = resourceEnd();
630  for (auto r = resourceBegin(); r != end; ++r)
631  {
633  }
634  }
635 
637  void free_list()
638  {
639  while (resource_list)
640  {
641  ResourceNode* rn = resource_list;
642  while (rn->node_list)
643  {
644  Node* n = rn->node_list;
645  rn->node_list = n->next;
646  mempool::getInstance().free(n);
647  }
648  resource_list = rn->next;
649  free(rn);
650  }
651  }
652 
653  ~PinnedTally() { free_list(); }
654 
655  std::mutex m_mutex;
656 
657 private:
658  ResourceNode* resource_list;
659 };
660 
661 //
663 //
664 // Reduction classes.
665 //
667 //
668 
671 template<typename Combiner,
672  typename Accessor,
673  typename T,
674  size_t replication,
675  size_t atomic_stride>
676 struct ReduceLastBlock_Data
677 {
678  using tally_mempool_type = pinned_mempool_type;
679  using data_mempool_type = device_mempool_type;
680  using count_mempool_type = device_zeroed_mempool_type;
681 
682  static constexpr size_t tally_slots = replication;
683 
684  mutable T value;
685  T identity;
686  unsigned int* device_count;
688  bool owns_device_pointer;
689 
690  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {}
691 
697  ReduceLastBlock_Data(T initValue, T identity_)
698  : value {initValue},
699  identity {identity_},
700  device_count {nullptr},
701  device {},
702  owns_device_pointer {false}
703  {}
704 
706  ReduceLastBlock_Data(const ReduceLastBlock_Data& other)
707  : value {other.identity},
708  identity {other.identity},
709  device_count {other.device_count},
710  device {other.device},
711  owns_device_pointer {false}
712  {}
713 
714  ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default;
715 
717  // uninitialized memory
718  T* init_grid_vals(T (&output)[tally_slots])
719  {
720  for (size_t r = 0; r < tally_slots; ++r)
721  {
722  output[r] = identity;
723  }
724  return &output[0];
725  }
726 
729  void grid_reduce(T* output)
730  {
731  T temp = value;
732 
733  size_t replicationId =
734  impl::grid_reduce_last_block<Combiner, Accessor, replication,
735  atomic_stride>(temp, identity, device,
736  device_count);
737  if (replicationId != replication)
738  {
739  output[replicationId] = temp;
740  }
741  }
742 
744  // allocate device pointers and get a new result buffer from the pinned tally
745  bool setupForDevice()
746  {
747  bool act = !device.allocated() && setupReducers();
748  if (act)
749  {
750  cuda_dim_t gridDim = currentGridDim();
751  size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
752  size_t maxNumSlots = (numBlocks + replication - 1) / replication;
753  device.allocate(maxNumSlots * replication);
754  device_count =
755  count_mempool_type::getInstance().template malloc<unsigned int>(
756  replication * atomic_stride);
757  owns_device_pointer = true;
758  }
759  return act;
760  }
761 
763  // free device pointers
764  bool teardownForDevice()
765  {
766  bool act = owns_device_pointer;
767  if (act)
768  {
769  device.deallocate();
770  count_mempool_type::getInstance().free(device_count);
771  device_count = nullptr;
772  owns_device_pointer = false;
773  }
774  return act;
775  }
776 };
777 
779 template<typename Combiner,
780  typename T,
781  size_t replication,
782  size_t atomic_stride>
783 struct ReduceAtomicHostInit_Data
784 {
785  using tally_mempool_type = device_pinned_mempool_type;
786 
787  static constexpr size_t tally_slots = replication * atomic_stride;
788 
789  mutable T value;
790  T identity;
791  bool is_setup;
792  bool owns_device_pointer;
793 
794  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {};
795 
796  ReduceAtomicHostInit_Data(T initValue, T identity_)
797  : value {initValue},
798  identity {identity_},
799  is_setup {false},
800  owns_device_pointer {false}
801  {}
802 
804  ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other)
805  : value {other.identity},
806  identity {other.identity},
807  is_setup {other.is_setup},
808  owns_device_pointer {false}
809  {}
810 
811  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) =
812  default;
813 
815  // uninitialized memory
816  T* init_grid_vals(T (&output)[tally_slots])
817  {
818  for (size_t r = 0; r < tally_slots; ++r)
819  {
820  output[r] = identity;
821  }
822  return &output[0];
823  }
824 
827  void grid_reduce(T* output)
828  {
829  T temp = value;
830 
831  impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
832  temp, identity, output);
833  }
834 
836  // allocate device pointers and get a new result buffer from the pinned tally
837  bool setupForDevice()
838  {
839  bool act = !is_setup && setupReducers();
840  if (act)
841  {
842  is_setup = true;
843  owns_device_pointer = true;
844  }
845  return act;
846  }
847 
849  // free device pointers
850  bool teardownForDevice()
851  {
852  bool act = owns_device_pointer;
853  if (act)
854  {
855  is_setup = false;
856  owns_device_pointer = false;
857  }
858  return act;
859  }
860 };
861 
863 template<typename Combiner,
864  typename Accessor,
865  typename T,
866  size_t replication,
867  size_t atomic_stride>
868 struct ReduceAtomicDeviceInit_Data
869 {
870  using tally_mempool_type = pinned_mempool_type;
871  using data_mempool_type = device_mempool_type;
872  using count_mempool_type = device_zeroed_mempool_type;
873 
874  static constexpr size_t tally_slots = replication;
875 
876  mutable T value;
877  T identity;
878  unsigned int* device_count;
879  T* device;
880  bool owns_device_pointer;
881 
882  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {};
883 
884  ReduceAtomicDeviceInit_Data(T initValue, T identity_)
885  : value {initValue},
886  identity {identity_},
887  device_count {nullptr},
888  device {nullptr},
889  owns_device_pointer {false}
890  {}
891 
893  ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other)
894  : value {other.identity},
895  identity {other.identity},
896  device_count {other.device_count},
897  device {other.device},
898  owns_device_pointer {false}
899  {}
900 
901  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) =
902  default;
903 
905  // uninitialized memory
906  T* init_grid_vals(T (&output)[tally_slots])
907  {
908  for (size_t r = 0; r < tally_slots; ++r)
909  {
910  output[r] = identity;
911  }
912  return &output[0];
913  }
914 
917  void grid_reduce(T* output)
918  {
919  T temp = value;
920 
921  size_t replicationId =
922  impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
923  atomic_stride>(
924  temp, identity, device, device_count);
925  if (replicationId != replication)
926  {
927  output[replicationId] = temp;
928  }
929  }
930 
932  // allocate device pointers and get a new result buffer from the pinned tally
933  bool setupForDevice()
934  {
935  bool act = !device && setupReducers();
936  if (act)
937  {
938  device = data_mempool_type::getInstance().template malloc<T>(
939  replication * atomic_stride);
940  device_count =
941  count_mempool_type::getInstance().template malloc<unsigned int>(
942  replication * atomic_stride);
943  owns_device_pointer = true;
944  }
945  return act;
946  }
947 
949  // free device pointers
950  bool teardownForDevice()
951  {
952  bool act = owns_device_pointer;
953  if (act)
954  {
955  data_mempool_type::getInstance().free(device);
956  device = nullptr;
957  count_mempool_type::getInstance().free(device_count);
958  device_count = nullptr;
959  owns_device_pointer = false;
960  }
961  return act;
962  }
963 };
964 
966 template<typename Combiner, typename T, typename tuning>
967 class Reduce
968 {
969  static constexpr size_t replication =
970  (tuning::replication > 0) ? tuning::replication : 1;
971  static constexpr size_t atomic_stride =
972  (tuning::atomic_stride > 0)
973  ? tuning::atomic_stride
974  : ((policy::cuda::device_constants
975  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
977  policy::cuda::device_constants
978  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
979  sizeof(T))
980  : 1);
981 
982  using Accessor = std::conditional_t<
983  (tuning::comm_mode == block_communication_mode::block_fence),
984  impl::AccessorDeviceScopeUseBlockFence,
985  std::conditional_t<(tuning::comm_mode ==
986  block_communication_mode::device_fence),
987  impl::AccessorDeviceScopeUseDeviceFence,
988  void>>;
989 
990  static constexpr bool atomic_policy =
991  (tuning::algorithm ==
992  reduce_algorithm::init_device_combine_atomic_block) ||
993  (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
994  static constexpr bool atomic_available =
995  RAJA::reduce::cuda::cuda_atomic_available<T>::value;
996 
998  using reduce_data_type = std::conditional_t<
999  (tuning::algorithm == reduce_algorithm::combine_last_block) ||
1000  (atomic_policy && !atomic_available),
1001  cuda::ReduceLastBlock_Data<Combiner,
1002  Accessor,
1003  T,
1004  replication,
1005  atomic_stride>,
1006  std::conditional_t<
1007  atomic_available,
1008  std::conditional_t<
1009  (tuning::algorithm ==
1010  reduce_algorithm::init_device_combine_atomic_block),
1011  cuda::ReduceAtomicDeviceInit_Data<Combiner,
1012  Accessor,
1013  T,
1014  replication,
1015  atomic_stride>,
1016  std::conditional_t<
1017  (tuning::algorithm ==
1018  reduce_algorithm::init_host_combine_atomic_block),
1019  cuda::ReduceAtomicHostInit_Data<Combiner,
1020  T,
1021  replication,
1022  atomic_stride>,
1023  void>>,
1024  void>>;
1025 
1026  static constexpr size_t tally_slots = reduce_data_type::tally_slots;
1027 
1028  using TallyType = PinnedTally<T,
1029  tally_slots,
1030  typename reduce_data_type::tally_mempool_type>;
1031 
1033  // only use list before setup for device and only use val_ptr after
1034  union tally_u
1035  {
1036  TallyType* list;
1037  T* val_ptr;
1038  constexpr tally_u(TallyType* l) : list(l) {};
1039  constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
1040  };
1041 
1042 public:
1043  Reduce() : Reduce(T(), Combiner::identity()) {}
1044 
1046  // the original object's parent is itself
1047  explicit Reduce(T init_val, T identity_ = Combiner::identity())
1048  : parent {this},
1049  tally_or_val_ptr {new TallyType},
1050  val(init_val, identity_)
1051  {}
1052 
1053  void reset(T in_val, T identity_ = Combiner::identity())
1054  {
1055  operator T(); // syncs device
1056  val = reduce_data_type(in_val, identity_);
1057  }
1058 
1060  // init val_ptr to avoid uninitialized read caused by host copy of
1061  // reducer in host device lambda not being used on device.
1063  Reduce(const Reduce& other)
1064 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1065  : parent {other.parent},
1066 #else
1067  : parent {&other},
1068 #endif
1069  tally_or_val_ptr {other.tally_or_val_ptr},
1070  val(other.val)
1071  {
1072 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1073  if (parent)
1074  {
1075  if (val.setupForDevice())
1076  {
1077  tally_or_val_ptr.val_ptr = val.init_grid_vals(
1078  tally_or_val_ptr.list->new_value(currentResource()));
1079  parent = nullptr;
1080  }
1081  }
1082 #endif
1083  }
1084 
1086  // on device store in pinned buffer on host
1088  ~Reduce()
1089  {
1090 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
1091  if (parent == this)
1092  {
1093  delete tally_or_val_ptr.list;
1094  tally_or_val_ptr.list = nullptr;
1095  }
1096  else if (parent)
1097  {
1098  if (val.value != val.identity)
1099  {
1100  std::lock_guard<std::mutex> lock(tally_or_val_ptr.list->m_mutex);
1101  parent->combine(val.value);
1102  }
1103  }
1104  else
1105  {
1106  if (val.teardownForDevice())
1107  {
1108  tally_or_val_ptr.val_ptr = nullptr;
1109  }
1110  }
1111 #else
1112  if (!parent->parent)
1113  {
1114  val.grid_reduce(tally_or_val_ptr.val_ptr);
1115  }
1116  else
1117  {
1118  parent->combine(val.value);
1119  }
1120 #endif
1121  }
1122 
1124  operator T()
1125  {
1126  auto n = tally_or_val_ptr.list->begin();
1127  auto end = tally_or_val_ptr.list->end();
1128  if (n != end)
1129  {
1130  tally_or_val_ptr.list->synchronize_resources();
1132  std::move(val.value));
1133  for (; n != end; ++n)
1134  {
1135  T(&values)[tally_slots] = *n;
1136  for (size_t r = 0; r < tally_slots; ++r)
1137  {
1138  reducer.combine(std::move(values[r]));
1139  }
1140  }
1141  val.value = reducer.get_and_reset();
1142  tally_or_val_ptr.list->free_list();
1143  }
1144  return val.value;
1145  }
1146 
1148  T get() { return operator T(); }
1149 
1152  void combine(T other) const { Combiner {}(val.value, other); }
1153 
1157  T& local() const { return val.value; }
1158 
1159  T get_combined() const { return val.value; }
1160 
1161 private:
1162  const Reduce* parent;
1163  tally_u tally_or_val_ptr;
1164  reduce_data_type val;
1165 };
1166 
1167 } // end namespace cuda
1168 
1170 template<typename tuning, typename T>
1171 class ReduceSum<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1172  : public cuda::Reduce<RAJA::reduce::sum<T>, T, tuning>
1173 {
1174 
1175 public:
1176  using Base = cuda::Reduce<RAJA::reduce::sum<T>, T, tuning>;
1177  using Base::Base;
1178 
1181  const ReduceSum& operator+=(T rhs) const
1182  {
1183  this->combine(rhs);
1184  return *this;
1185  }
1186 };
1187 
1189 template<typename tuning, typename T>
1190 class ReduceBitOr<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1191  : public cuda::Reduce<RAJA::reduce::or_bit<T>, T, tuning>
1192 {
1193 
1194 public:
1195  using Base = cuda::Reduce<RAJA::reduce::or_bit<T>, T, tuning>;
1196  using Base::Base;
1197 
1200  const ReduceBitOr& operator|=(T rhs) const
1201  {
1202  this->combine(rhs);
1203  return *this;
1204  }
1205 };
1206 
1208 template<typename tuning, typename T>
1209 class ReduceBitAnd<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1210  : public cuda::Reduce<RAJA::reduce::and_bit<T>, T, tuning>
1211 {
1212 
1213 public:
1214  using Base = cuda::Reduce<RAJA::reduce::and_bit<T>, T, tuning>;
1215  using Base::Base;
1216 
1219  const ReduceBitAnd& operator&=(T rhs) const
1220  {
1221  this->combine(rhs);
1222  return *this;
1223  }
1224 };
1225 
1227 template<typename tuning, typename T>
1228 class ReduceMin<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1229  : public cuda::Reduce<RAJA::reduce::min<T>, T, tuning>
1230 {
1231 
1232 public:
1233  using Base = cuda::Reduce<RAJA::reduce::min<T>, T, tuning>;
1234  using Base::Base;
1235 
1238  const ReduceMin& min(T rhs) const
1239  {
1240  this->combine(rhs);
1241  return *this;
1242  }
1243 };
1244 
1246 template<typename tuning, typename T>
1247 class ReduceMax<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
1248  : public cuda::Reduce<RAJA::reduce::max<T>, T, tuning>
1249 {
1250 
1251 public:
1252  using Base = cuda::Reduce<RAJA::reduce::max<T>, T, tuning>;
1253  using Base::Base;
1254 
1257  const ReduceMax& max(T rhs) const
1258  {
1259  this->combine(rhs);
1260  return *this;
1261  }
1262 };
1263 
1265 template<typename tuning, typename T, typename IndexType>
1266 class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
1267  : public cuda::Reduce<
1268  RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
1269  RAJA::reduce::detail::ValueLoc<T, IndexType>,
1270  tuning>
1271 {
1272 
1273 public:
1275  using Combiner = RAJA::reduce::min<value_type>;
1276  using NonLocCombiner = RAJA::reduce::min<T>;
1277  using Base = cuda::Reduce<Combiner, value_type, tuning>;
1278  using Base::Base;
1279 
1281  ReduceMinLoc(T init_val,
1282  IndexType init_idx,
1283  T identity_val = NonLocCombiner::identity(),
1284  IndexType identity_idx =
1286  : Base(value_type(init_val, init_idx),
1287  value_type(identity_val, identity_idx))
1288  {}
1289 
1291  // this must be here to hide Base::reset
1292  void reset(T init_val,
1293  IndexType init_idx,
1294  T identity_val = NonLocCombiner::identity(),
1295  IndexType identity_idx =
1297  {
1298  Base::reset(value_type(init_val, init_idx),
1299  value_type(identity_val, identity_idx));
1300  }
1301 
1304  const ReduceMinLoc& minloc(T rhs, IndexType loc) const
1305  {
1306  this->combine(value_type(rhs, loc));
1307  return *this;
1308  }
1309 
1311  IndexType getLoc() { return Base::get().getLoc(); }
1312 
1314  operator T() { return Base::get(); }
1315 
1317  T get() { return Base::get(); }
1318 };
1319 
1321 template<typename tuning, typename T, typename IndexType>
1322 class ReduceMaxLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
1323  : public cuda::Reduce<
1324  RAJA::reduce::max<
1325  RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
1326  RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
1327  tuning>
1328 {
1329 public:
1331  using Combiner = RAJA::reduce::max<value_type>;
1332  using NonLocCombiner = RAJA::reduce::max<T>;
1333  using Base = cuda::Reduce<Combiner, value_type, tuning>;
1334  using Base::Base;
1335 
1337  ReduceMaxLoc(T init_val,
1338  IndexType init_idx,
1339  T identity_val = NonLocCombiner::identity(),
1340  IndexType identity_idx =
1342  : Base(value_type(init_val, init_idx),
1343  value_type(identity_val, identity_idx))
1344  {}
1345 
1347  // this must be here to hide Base::reset
1348  void reset(T init_val,
1349  IndexType init_idx,
1350  T identity_val = NonLocCombiner::identity(),
1351  IndexType identity_idx =
1353  {
1354  Base::reset(value_type(init_val, init_idx),
1355  value_type(identity_val, identity_idx));
1356  }
1357 
1360  const ReduceMaxLoc& maxloc(T rhs, IndexType loc) const
1361  {
1362  this->combine(value_type(rhs, loc));
1363  return *this;
1364  }
1365 
1367  IndexType getLoc() { return Base::get().getLoc(); }
1368 
1370  operator T() { return Base::get(); }
1371 
1373  T get() { return Base::get(); }
1374 };
1375 
1376 } // namespace RAJA
1377 
1378 #endif // closing endif for RAJA_ENABLE_CUDA guard
1379 
1380 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file for common RAJA internal definitions.
Header file for common RAJA internal definitions.
RAJA header file containing an implementation of a memory pool.
Array class specialized for Struct of Array data layout.
Definition: SoAArray.hpp:42
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAArray.hpp:48
Pointer class specialized for Struct of Array data layout allocated via RAJA basic_mempools.
Definition: SoAPtr.hpp:52
constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val)
Definition: SoAPtr.hpp:100
constexpr RAJA_HOST_DEVICE value_type get(size_t i) const
Definition: SoAPtr.hpp:95
Definition: reduce.hpp:131
Header file containing RAJA intrinsics templates for CUDA execution.
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
constexpr auto Reduce(T *target)
Definition: reducer.hpp:231
RAJA_INLINE RAJA_HOST_DEVICE auto operator*(LHS const &left_operand, RHS const &right_operand) -> TensorMultiply< typename NormalizeOperandHelper< LHS >::return_type, RHS >
Definition: TensorMultiply.hpp:155
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
Atomic bitwise AND equivalent to (*acc) = (*acc) & value This only works with integral data types.
Definition: atomic.hpp:224
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
Atomic bitwise OR equivalent to (*acc) = (*acc) | value This only works with integral data types.
Definition: atomic.hpp:240
RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it)
returns iterator to next item
Definition: algorithm.hpp:90
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
Atomic add.
Definition: atomic.hpp:117
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
std::conditional_t< RAJA::operators::is_fp_associative< T >::value, BinaryTreeReduce< T, BinaryOp >, LeftFoldReduce< T, BinaryOp > > HighAccuracyReduce
Definition: reduce.hpp:357
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
Atomic maximum equivalent to (*acc) = std::max(*acc, value)
Definition: atomic.hpp:156
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
Atomic compare and swap.
Definition: atomic.hpp:286
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
Atomic increment.
Definition: atomic.hpp:168
RAJA_SUPPRESS_HD_WARN RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
Atomic minimum equivalent to (*acc) = std::min(*acc, value)
Definition: atomic.hpp:143
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Base types used in common for RAJA reducer objects.
Header file providing RAJA reduction declarations.
RAJA header file defining atomic operations for CUDA.
Header file containing utility methods used in CUDA operations.
Definition: reduce.hpp:115
Definition: reduce.hpp:95
Definition: reduce.hpp:91
Header file for RAJA type definitions.
Header file providing RAJA sort templates.