20 #ifndef RAJA_sycl_reduce_HPP
21 #define RAJA_sycl_reduce_HPP
23 #include "RAJA/config.hpp"
25 #if defined(RAJA_ENABLE_SYCL)
42 template<
typename T,
typename I>
63 template<
typename T,
typename I>
86 static int MaxNumTeams = 1;
93 bool isMapped {
false};
95 Offload_Info() =
default;
97 Offload_Info(
const Offload_Info& other)
98 : hostID {other.hostID},
99 deviceID {other.deviceID},
100 isMapped {other.isMapped}
114 Reduce_Data() =
delete;
120 Reduce_Data(T initValue, T identityValue, Offload_Info& info)
123 ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
126 device =
reinterpret_cast<T*
>(
127 ::sycl::malloc_device(sycl::MaxNumTeams *
sizeof(T), *(q)));
128 host =
reinterpret_cast<T*
>(
129 ::sycl::malloc_host(sycl::MaxNumTeams *
sizeof(T), *(q)));
133 printf(
"Unable to allocate space on host\n");
138 printf(
"Unable to allocate space on device\n");
141 std::fill_n(host, sycl::MaxNumTeams, identityValue);
145 void reset(T initValue) { value = initValue; }
148 Reduce_Data(
const Reduce_Data&) =
default;
153 ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
157 camp::resources::Resource res = camp::resources::Sycl();
158 q = res.get<camp::resources::Sycl>().get_queue();
163 q->memcpy(
reinterpret_cast<void*
>(device),
164 reinterpret_cast<void*
>(host), sycl::MaxNumTeams *
sizeof(T));
172 ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
176 camp::resources::Resource res = camp::resources::Sycl();
177 q = res.get<camp::resources::Sycl>().get_queue();
181 auto e = q->memcpy(
reinterpret_cast<void*
>(host),
182 reinterpret_cast<void*
>(device),
183 sycl::MaxNumTeams *
sizeof(T));
191 ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
195 ::sycl::free(
reinterpret_cast<void*
>(device), *q);
200 ::sycl::free(
reinterpret_cast<void*
>(host), *q);
211 template<
typename Reducer,
typename T>
214 TargetReduce() =
delete;
215 TargetReduce(
const TargetReduce&) =
default;
217 explicit TargetReduce(T init_val)
218 : val(Reducer::identity(), Reducer::identity(), info),
221 finalVal(Reducer::identity())
224 void reset(T init_val_, T identity_ = Reducer::identity())
227 val = sycl::Reduce_Data<T>(identity_, identity_, info);
228 info.isMapped =
false;
230 finalVal = identity_;
241 val.deviceToHost(info);
242 for (
int i = 0; i < sycl::MaxNumTeams; ++i)
244 Reducer {}(val.value, val.host[i]);
247 info.isMapped =
true;
249 finalVal = Reducer::identity();
250 Reducer {}(finalVal, initVal);
251 Reducer {}(finalVal, val.value);
252 T returnVal = finalVal;
258 T
get() {
return operator T(); }
261 TargetReduce&
reduce(T rhsVal)
263 #ifdef __SYCL_DEVICE_ONLY__
265 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
266 ::sycl::memory_scope::device,
267 ::sycl::access::address_space::global_space>(
269 Reducer {}(atm, rhsVal);
272 Reducer {}(val.value, rhsVal);
278 const TargetReduce&
reduce(T rhsVal)
const
280 #ifdef __SYCL_DEVICE_ONLY__
282 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
283 ::sycl::memory_scope::device,
284 ::sycl::access::address_space::global_space>(
286 Reducer {}(atm, rhsVal);
289 Reducer {}(val.value, rhsVal);
295 sycl::Reduce_Data<T> val;
299 sycl::Offload_Info info;
307 template<
typename Reducer,
typename T,
typename IndexType>
308 struct TargetReduceLoc
310 TargetReduceLoc() =
delete;
311 TargetReduceLoc(
const TargetReduceLoc&) =
default;
313 explicit TargetReduceLoc(
316 T identity_val_ = Reducer::identity(),
317 IndexType identity_loc_ =
320 val(identity_val_, identity_val_, info),
321 loc(identity_loc_, identity_loc_, info),
323 finalVal(identity_val_),
325 finalLoc(identity_loc_)
328 void reset(T init_val_,
330 T identity_val_ = Reducer::identity(),
331 IndexType identity_loc_ =
335 val = sycl::Reduce_Data<T>(identity_val_, identity_val_, info);
337 loc = sycl::Reduce_Data<IndexType>(identity_loc_, identity_loc_, info);
338 info.isMapped =
false;
340 finalVal = identity_val_;
342 finalLoc = identity_loc_;
346 ~TargetReduceLoc() {}
353 val.deviceToHost(info);
354 loc.deviceToHost(info);
356 for (
int i = 0; i < sycl::MaxNumTeams; ++i)
358 Reducer {}(val.value, loc.value, val.host[i], loc.host[i]);
360 info.isMapped =
true;
362 finalVal = Reducer::identity();
364 Reducer {}(finalVal, finalLoc, initVal, initLoc);
365 Reducer {}(finalVal, finalLoc, val.value, loc.value);
366 returnVal = finalVal;
367 returnLoc = finalLoc;
368 reset(finalVal, finalLoc);
373 T
get() {
return operator T(); }
379 if (!info.isMapped)
get();
385 TargetReduceLoc&
reduce(T rhsVal, IndexType rhsLoc)
387 #ifdef __SYCL_DEVICE_ONLY__
389 ::sycl::atomic_fence(::sycl::memory_order_acquire,
390 ::sycl::memory_scope::device);
391 Reducer {}(val.device[i], loc.device[i], rhsVal, rhsLoc);
392 ::sycl::atomic_fence(::sycl::memory_order_release,
393 ::sycl::memory_scope::device);
396 Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
402 const TargetReduceLoc&
reduce(T rhsVal, IndexType rhsLoc)
const
404 Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
409 sycl::Reduce_Data<T> val;
410 sycl::Reduce_Data<IndexType> loc;
414 sycl::Offload_Info info;
428 class ReduceSum<sycl_reduce, T> :
public TargetReduce<RAJA::reduce::sum<T>, T>
431 using self = ReduceSum<sycl_reduce, T>;
432 using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
433 using parent::parent;
436 self& operator+=(T rhsVal)
438 parent::reduce(rhsVal);
443 const self& operator+=(T rhsVal)
const
445 #ifdef __SYCL_DEVICE_ONLY__
447 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
448 ::sycl::memory_scope::device,
449 ::sycl::access::address_space::global_space>(
450 parent::val.device[i]);
451 atm.fetch_add(rhsVal);
454 parent::reduce(rhsVal);
462 class ReduceBitOr<sycl_reduce, T>
463 :
public TargetReduce<RAJA::reduce::or_bit<T>, T>
466 using self = ReduceBitOr<sycl_reduce, T>;
467 using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
468 using parent::parent;
471 self& operator|=(T rhsVal)
473 #ifdef __SYCL_DEVICE_ONLY__
475 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
476 ::sycl::memory_scope::device,
477 ::sycl::access::address_space::global_space>(
478 parent::val.device[i]);
482 parent::reduce(rhsVal);
488 const self& operator|=(T rhsVal)
const
490 #ifdef __SYCL_DEVICE_ONLY__
492 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
493 ::sycl::memory_scope::device,
494 ::sycl::access::address_space::global_space>(
495 parent::val.device[i]);
499 parent::reduce(rhsVal);
507 class ReduceBitAnd<sycl_reduce, T>
508 :
public TargetReduce<RAJA::reduce::and_bit<T>, T>
511 using self = ReduceBitAnd<sycl_reduce, T>;
512 using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
513 using parent::parent;
516 self& operator&=(T rhsVal)
518 #ifdef __SYCL_DEVICE_ONLY__
520 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
521 ::sycl::memory_scope::device,
522 ::sycl::access::address_space::global_space>(
523 parent::val.device[i]);
527 parent::reduce(rhsVal);
533 const self& operator&=(T rhsVal)
const
535 #ifdef __SYCL_DEVICE_ONLY__
537 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
538 ::sycl::memory_scope::device,
539 ::sycl::access::address_space::global_space>(
540 parent::val.device[i]);
544 parent::reduce(rhsVal);
552 class ReduceMin<sycl_reduce, T> :
public TargetReduce<RAJA::reduce::min<T>, T>
555 using self = ReduceMin<sycl_reduce, T>;
556 using parent = TargetReduce<RAJA::reduce::min<T>, T>;
557 using parent::parent;
562 #ifdef __SYCL_DEVICE_ONLY__
564 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
565 ::sycl::memory_scope::device,
566 ::sycl::access::address_space::global_space>(
567 parent::val.device[i]);
568 atm.fetch_min(rhsVal);
571 parent::reduce(rhsVal);
577 const self&
min(T rhsVal)
const
579 #ifdef __SYCL_DEVICE_ONLY__
581 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
582 ::sycl::memory_scope::device,
583 ::sycl::access::address_space::global_space>(
584 parent::val.device[i]);
585 atm.fetch_min(rhsVal);
588 parent::reduce(rhsVal);
596 class ReduceMax<sycl_reduce, T> :
public TargetReduce<RAJA::reduce::max<T>, T>
599 using self = ReduceMax<sycl_reduce, T>;
600 using parent = TargetReduce<RAJA::reduce::max<T>, T>;
601 using parent::parent;
606 #ifdef __SYCL_DEVICE_ONLY__
608 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
609 ::sycl::memory_scope::device,
610 ::sycl::access::address_space::global_space>(
611 parent::val.device[i]);
612 atm.fetch_max(rhsVal);
615 parent::reduce(rhsVal);
621 const self&
max(T rhsVal)
const
623 #ifdef __SYCL_DEVICE_ONLY__
625 auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel,
626 ::sycl::memory_scope::device,
627 ::sycl::access::address_space::global_space>(
628 parent::val.device[i]);
629 atm.fetch_max(rhsVal);
632 parent::reduce(rhsVal);
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file providing RAJA reduction declarations.
Definition: reduce.hpp:115
Header file containing RAJA SYCL policy definitions.
Header file for RAJA type definitions.