RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
reduce.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_sycl_reduce_HPP
21 #define RAJA_sycl_reduce_HPP
22 
23 #include "RAJA/config.hpp"
24 
25 #if defined(RAJA_ENABLE_SYCL)
26 
27 #include <algorithm>
28 
29 
30 #include "RAJA/util/types.hpp"
31 
32 #include "RAJA/pattern/reduce.hpp"
33 
35 
36 namespace RAJA
37 {
38 
39 namespace sycl
40 {
41 
42 template<typename T, typename I>
43 struct minloc
44 {
45  RAJA_HOST_DEVICE static constexpr T identity()
46  {
48  }
49 
50  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val,
51  I& loc,
52  const T v,
53  const I l)
54  {
55  if (v < val)
56  {
57  loc = l;
58  val = v;
59  }
60  }
61 };
62 
63 template<typename T, typename I>
64 struct maxloc
65 {
66  RAJA_HOST_DEVICE static constexpr T identity()
67  {
69  }
70 
71  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val,
72  I& loc,
73  const T v,
74  const I l)
75  {
76  if (v > val)
77  {
78  loc = l;
79  val = v;
80  }
81  }
82 };
83 
84 // Alias for clarity. Reduction size operates on number of teams.
85 // Ideally, MaxNumTeams = ThreadsPerTeam in omp_target_parallel_for_exec.
86 static int MaxNumTeams = 1;
87 
89 struct Offload_Info
90 {
91  int hostID {1};
92  int deviceID {2};
93  bool isMapped {false};
94 
95  Offload_Info() = default;
96 
97  Offload_Info(const Offload_Info& other)
98  : hostID {other.hostID},
99  deviceID {other.deviceID},
100  isMapped {other.isMapped}
101  {}
102 };
103 
106 template<typename T>
107 struct Reduce_Data
108 {
109  mutable T value;
110  T* device;
111  T* host;
112 
114  Reduce_Data() = delete;
115 
120  Reduce_Data(T initValue, T identityValue, Offload_Info& info)
121  : value(initValue)
122  {
123  ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
124 
125 
126  device = reinterpret_cast<T*>(
127  ::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
128  host = reinterpret_cast<T*>(
129  ::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
130 
131  if (!host)
132  {
133  printf("Unable to allocate space on host\n");
134  exit(1);
135  }
136  if (!device)
137  {
138  printf("Unable to allocate space on device\n");
139  exit(1);
140  }
141  std::fill_n(host, sycl::MaxNumTeams, identityValue);
142  hostToDevice(info);
143  }
144 
145  void reset(T initValue) { value = initValue; }
146 
148  Reduce_Data(const Reduce_Data&) = default;
149 
151  RAJA_INLINE void hostToDevice(Offload_Info& RAJA_UNUSED_ARG(info))
152  {
153  ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
154 
155  if (!q)
156  {
157  camp::resources::Resource res = camp::resources::Sycl();
158  q = res.get<camp::resources::Sycl>().get_queue();
159  }
160 
161  // precondition: host and device are valid pointers
162  auto e =
163  q->memcpy(reinterpret_cast<void*>(device),
164  reinterpret_cast<void*>(host), sycl::MaxNumTeams * sizeof(T));
165 
166  e.wait();
167  }
168 
170  RAJA_INLINE void deviceToHost(Offload_Info& RAJA_UNUSED_ARG(info))
171  {
172  ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
173 
174  if (!q)
175  {
176  camp::resources::Resource res = camp::resources::Sycl();
177  q = res.get<camp::resources::Sycl>().get_queue();
178  }
179 
180  // precondition: host and device are valid pointers
181  auto e = q->memcpy(reinterpret_cast<void*>(host),
182  reinterpret_cast<void*>(device),
183  sycl::MaxNumTeams * sizeof(T));
184 
185  e.wait();
186  }
187 
189  RAJA_INLINE void cleanup(Offload_Info& RAJA_UNUSED_ARG(info))
190  {
191  ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
192 
193  if (device)
194  {
195  ::sycl::free(reinterpret_cast<void*>(device), *q);
196  device = nullptr;
197  }
198  if (host)
199  {
200  ::sycl::free(reinterpret_cast<void*>(host), *q);
201  // delete[] host;
202  host = nullptr;
203  }
204  }
205 };
206 
207 } // end namespace sycl
208 
211 template<typename Reducer, typename T>
212 struct TargetReduce
213 {
214  TargetReduce() = delete;
215  TargetReduce(const TargetReduce&) = default;
216 
217  explicit TargetReduce(T init_val)
218  : val(Reducer::identity(), Reducer::identity(), info),
219  info(),
220  initVal(init_val),
221  finalVal(Reducer::identity())
222  {}
223 
224  void reset(T init_val_, T identity_ = Reducer::identity())
225  {
226  val.cleanup(info);
227  val = sycl::Reduce_Data<T>(identity_, identity_, info);
228  info.isMapped = false;
229  initVal = init_val_;
230  finalVal = identity_;
231  }
232 
234  ~TargetReduce() {}
235 
237  operator T()
238  {
239  if (!info.isMapped)
240  {
241  val.deviceToHost(info);
242  for (int i = 0; i < sycl::MaxNumTeams; ++i)
243  {
244  Reducer {}(val.value, val.host[i]);
245  }
246  // val.cleanup(info);
247  info.isMapped = true;
248  }
249  finalVal = Reducer::identity();
250  Reducer {}(finalVal, initVal);
251  Reducer {}(finalVal, val.value);
252  T returnVal = finalVal;
253  reset(finalVal);
254  return returnVal;
255  }
256 
258  T get() { return operator T(); }
259 
261  TargetReduce& reduce(T rhsVal)
262  {
263 #ifdef __SYCL_DEVICE_ONLY__
264  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
265  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
266  ::sycl::memory_scope::device,
267  ::sycl::access::address_space::global_space>(
268  val.device[i]);
269  Reducer {}(atm, rhsVal);
270  return *this;
271 #else
272  Reducer {}(val.value, rhsVal);
273  return *this;
274 #endif
275  }
276 
278  const TargetReduce& reduce(T rhsVal) const
279  {
280 #ifdef __SYCL_DEVICE_ONLY__
281  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
282  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
283  ::sycl::memory_scope::device,
284  ::sycl::access::address_space::global_space>(
285  val.device[i]);
286  Reducer {}(atm, rhsVal);
287  return *this;
288 #else
289  Reducer {}(val.value, rhsVal);
290  return *this;
291 #endif
292  }
293 
295  sycl::Reduce_Data<T> val;
296 
297 private:
299  sycl::Offload_Info info;
301  T initVal;
302  T finalVal;
303 };
304 
307 template<typename Reducer, typename T, typename IndexType>
308 struct TargetReduceLoc
309 {
310  TargetReduceLoc() = delete;
311  TargetReduceLoc(const TargetReduceLoc&) = default;
312 
313  explicit TargetReduceLoc(
314  T init_val,
315  IndexType init_loc,
316  T identity_val_ = Reducer::identity(),
317  IndexType identity_loc_ =
319  : info(),
320  val(identity_val_, identity_val_, info),
321  loc(identity_loc_, identity_loc_, info),
322  initVal(init_val),
323  finalVal(identity_val_),
324  initLoc(init_loc),
325  finalLoc(identity_loc_)
326  {}
327 
328  void reset(T init_val_,
329  IndexType init_loc_,
330  T identity_val_ = Reducer::identity(),
331  IndexType identity_loc_ =
333  {
334  val.cleanup(info);
335  val = sycl::Reduce_Data<T>(identity_val_, identity_val_, info);
336  loc.cleanup(info);
337  loc = sycl::Reduce_Data<IndexType>(identity_loc_, identity_loc_, info);
338  info.isMapped = false;
339  initVal = init_val_;
340  finalVal = identity_val_;
341  initLoc = init_loc_;
342  finalLoc = identity_loc_;
343  }
344 
346  ~TargetReduceLoc() {}
347 
349  operator T()
350  {
351  if (!info.isMapped)
352  {
353  val.deviceToHost(info);
354  loc.deviceToHost(info);
355 
356  for (int i = 0; i < sycl::MaxNumTeams; ++i)
357  {
358  Reducer {}(val.value, loc.value, val.host[i], loc.host[i]);
359  }
360  info.isMapped = true;
361  }
362  finalVal = Reducer::identity();
363  finalLoc = IndexType(RAJA::reduce::detail::DefaultLoc<IndexType>().value());
364  Reducer {}(finalVal, finalLoc, initVal, initLoc);
365  Reducer {}(finalVal, finalLoc, val.value, loc.value);
366  returnVal = finalVal;
367  returnLoc = finalLoc;
368  reset(finalVal, finalLoc);
369  return returnVal;
370  }
371 
373  T get() { return operator T(); }
374 
377  IndexType getLoc()
378  {
379  if (!info.isMapped) get();
380  // return loc.value;
381  return (returnLoc);
382  }
383 
385  TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc)
386  {
387 #ifdef __SYCL_DEVICE_ONLY__
388  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
389  ::sycl::atomic_fence(::sycl::memory_order_acquire,
390  ::sycl::memory_scope::device);
391  Reducer {}(val.device[i], loc.device[i], rhsVal, rhsLoc);
392  ::sycl::atomic_fence(::sycl::memory_order_release,
393  ::sycl::memory_scope::device);
394  return *this;
395 #else
396  Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
397  return *this;
398 #endif
399  }
400 
402  const TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc) const
403  {
404  Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
405  return *this;
406  }
407 
409  sycl::Reduce_Data<T> val;
410  sycl::Reduce_Data<IndexType> loc;
411 
412 private:
414  sycl::Offload_Info info;
416  // sycl::Reduce_Data<T> val;
418  T initVal;
419  T finalVal;
420  T returnVal;
421  IndexType initLoc;
422  IndexType finalLoc;
423  IndexType returnLoc;
424 };
425 
427 template<typename T>
428 class ReduceSum<sycl_reduce, T> : public TargetReduce<RAJA::reduce::sum<T>, T>
429 {
430 public:
431  using self = ReduceSum<sycl_reduce, T>;
432  using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
433  using parent::parent;
434 
436  self& operator+=(T rhsVal)
437  {
438  parent::reduce(rhsVal);
439  return *this;
440  }
441 
443  const self& operator+=(T rhsVal) const
444  {
445 #ifdef __SYCL_DEVICE_ONLY__
446  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
447  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
448  ::sycl::memory_scope::device,
449  ::sycl::access::address_space::global_space>(
450  parent::val.device[i]);
451  atm.fetch_add(rhsVal);
452  return *this;
453 #else
454  parent::reduce(rhsVal);
455  return *this;
456 #endif
457  }
458 };
459 
461 template<typename T>
462 class ReduceBitOr<sycl_reduce, T>
463  : public TargetReduce<RAJA::reduce::or_bit<T>, T>
464 {
465 public:
466  using self = ReduceBitOr<sycl_reduce, T>;
467  using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
468  using parent::parent;
469 
471  self& operator|=(T rhsVal)
472  {
473 #ifdef __SYCL_DEVICE_ONLY__
474  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
475  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
476  ::sycl::memory_scope::device,
477  ::sycl::access::address_space::global_space>(
478  parent::val.device[i]);
479  atm |= rhsVal;
480  return *this;
481 #else
482  parent::reduce(rhsVal);
483  return *this;
484 #endif
485  }
486 
488  const self& operator|=(T rhsVal) const
489  {
490 #ifdef __SYCL_DEVICE_ONLY__
491  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
492  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
493  ::sycl::memory_scope::device,
494  ::sycl::access::address_space::global_space>(
495  parent::val.device[i]);
496  atm |= rhsVal;
497  return *this;
498 #else
499  parent::reduce(rhsVal);
500  return *this;
501 #endif
502  }
503 };
504 
506 template<typename T>
507 class ReduceBitAnd<sycl_reduce, T>
508  : public TargetReduce<RAJA::reduce::and_bit<T>, T>
509 {
510 public:
511  using self = ReduceBitAnd<sycl_reduce, T>;
512  using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
513  using parent::parent;
514 
516  self& operator&=(T rhsVal)
517  {
518 #ifdef __SYCL_DEVICE_ONLY__
519  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
520  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
521  ::sycl::memory_scope::device,
522  ::sycl::access::address_space::global_space>(
523  parent::val.device[i]);
524  atm &= rhsVal;
525  return *this;
526 #else
527  parent::reduce(rhsVal);
528  return *this;
529 #endif
530  }
531 
533  const self& operator&=(T rhsVal) const
534  {
535 #ifdef __SYCL_DEVICE_ONLY__
536  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
537  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
538  ::sycl::memory_scope::device,
539  ::sycl::access::address_space::global_space>(
540  parent::val.device[i]);
541  atm &= rhsVal;
542  return *this;
543 #else
544  parent::reduce(rhsVal);
545  return *this;
546 #endif
547  }
548 };
549 
551 template<typename T>
552 class ReduceMin<sycl_reduce, T> : public TargetReduce<RAJA::reduce::min<T>, T>
553 {
554 public:
555  using self = ReduceMin<sycl_reduce, T>;
556  using parent = TargetReduce<RAJA::reduce::min<T>, T>;
557  using parent::parent;
558 
560  self& min(T rhsVal)
561  {
562 #ifdef __SYCL_DEVICE_ONLY__
563  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
564  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
565  ::sycl::memory_scope::device,
566  ::sycl::access::address_space::global_space>(
567  parent::val.device[i]);
568  atm.fetch_min(rhsVal);
569  return *this;
570 #else
571  parent::reduce(rhsVal);
572  return *this;
573 #endif
574  }
575 
577  const self& min(T rhsVal) const
578  {
579 #ifdef __SYCL_DEVICE_ONLY__
580  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
581  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
582  ::sycl::memory_scope::device,
583  ::sycl::access::address_space::global_space>(
584  parent::val.device[i]);
585  atm.fetch_min(rhsVal);
586  return *this;
587 #else
588  parent::reduce(rhsVal);
589  return *this;
590 #endif
591  }
592 };
593 
595 template<typename T>
596 class ReduceMax<sycl_reduce, T> : public TargetReduce<RAJA::reduce::max<T>, T>
597 {
598 public:
599  using self = ReduceMax<sycl_reduce, T>;
600  using parent = TargetReduce<RAJA::reduce::max<T>, T>;
601  using parent::parent;
602 
604  self& max(T rhsVal)
605  {
606 #ifdef __SYCL_DEVICE_ONLY__
607  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
608  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
609  ::sycl::memory_scope::device,
610  ::sycl::access::address_space::global_space>(
611  parent::val.device[i]);
612  atm.fetch_max(rhsVal);
613  return *this;
614 #else
615  parent::reduce(rhsVal);
616  return *this;
617 #endif
618  }
619 
621  const self& max(T rhsVal) const
622  {
623 #ifdef __SYCL_DEVICE_ONLY__
624  auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
625  auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
626  ::sycl::memory_scope::device,
627  ::sycl::access::address_space::global_space>(
628  parent::val.device[i]);
629  atm.fetch_max(rhsVal);
630  return *this;
631 #else
632  parent::reduce(rhsVal);
633  return *this;
634 #endif
635  }
636 };
637 
638 } // namespace RAJA
639 
640 #endif // closing endif for RAJA_ENABLE_SYCL guard
641 
642 #endif // closing endif for header file include guard
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file providing RAJA reduction declarations.
Definition: reduce.hpp:115
Header file containing RAJA SYCL policy definitions.
Header file for RAJA type definitions.