RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
forall.hpp
Go to the documentation of this file.
1 
15 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
16 // Copyright (c) Lawrence Livermore National Security, LLC and other
17 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
18 // files for dates and other details. No copyright assignment is required
19 // to contribute to RAJA.
20 //
21 // SPDX-License-Identifier: (BSD-3-Clause)
22 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
23 
24 #ifndef RAJA_forall_sycl_HPP
25 #define RAJA_forall_sycl_HPP
26 
27 #include "RAJA/config.hpp"
28 
29 #if defined(RAJA_ENABLE_SYCL)
30 
31 #include <algorithm>
32 #include <chrono>
33 
35 
36 #include "RAJA/pattern/forall.hpp"
37 
39 
40 #include "RAJA/util/macros.hpp"
41 #include "RAJA/util/types.hpp"
42 
45 
46 #include "RAJA/index/IndexSet.hpp"
47 
48 #include "RAJA/util/resource.hpp"
49 
50 namespace RAJA
51 {
52 
53 namespace policy
54 {
55 
56 namespace sycl
57 {
58 
59 namespace impl
60 {
61 
69 RAJA_INLINE
70 ::sycl::range<1> getGridDim(size_t len, size_t block_size)
71 {
72  size_t size = {block_size * ((len + block_size - 1) / block_size)};
73  ::sycl::range<1> gridSize(size);
74  return gridSize;
75 }
76 
77 } // namespace impl
78 
79 //
81 //
82 // Function templates for SYCL execution over iterables.
83 //
85 //
86 
87 
88 template<typename Iterable,
89  typename LoopBody,
90  size_t BlockSize,
91  bool Async,
92  typename ForallParam>
93 RAJA_INLINE concepts::enable_if_t<
94  resources::EventProxy<resources::Sycl>,
96 forall_impl(resources::Sycl& sycl_res,
97  sycl_exec<BlockSize, Async> const& pol,
98  Iterable&& iter,
99  LoopBody&& loop_body,
100  ForallParam f_params)
101 
102 {
103  using Iterator = camp::decay<decltype(std::begin(iter))>;
104  using IndexType =
105  camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
106  using EXEC_POL = camp::decay<decltype(pol)>;
107  using LOOP_BODY = camp::decay<LoopBody>;
108  // Deduce at compile time if lbody is trivially constructible and if user
109  // has supplied parameters. These will be used to determine which sycl launch
110  // to configure below.
111  constexpr bool is_parampack_empty =
113  constexpr bool is_lbody_trivially_copyable =
114  std::is_trivially_copyable<LoopBody>::value;
115 
116  //
117  // Compute the requested iteration space size
118  //
119  Iterator begin = std::begin(iter);
120  Iterator end = std::end(iter);
121  IndexType len = std::distance(begin, end);
122 
123  // Return immediately if there is no work to be done
124  if (len <= 0 || BlockSize <= 0)
125  {
126  return resources::EventProxy<resources::Sycl>(sycl_res);
127  }
128 
129  //
130  // Compute the number of blocks
131  //
132  sycl_dim_t blockSize {BlockSize};
133  sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
134 
135  ::sycl::queue* q = sycl_res.get_queue();
136  LOOP_BODY* lbody = nullptr;
137  Iterator* d_begin = nullptr;
138 
139  if constexpr (!is_parampack_empty)
140  {
142  }
143  if constexpr (!is_lbody_trivially_copyable)
144  {
145  //
146  // Setup shared memory buffers
147  // Kernel body is nontrivially copyable, create space on device and copy to
148  // Workaround until "is_device_copyable" is supported
149  //
150  lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LoopBody), *q);
151  q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
152 
153  d_begin = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
154  q->memcpy(d_begin, &begin, sizeof(Iterator)).wait();
155  }
156 
157  // Both the parallel_for call, combinations, and resolution are all
158  // unique to the parameter case, so we make a constexpr branch here
159  if constexpr (!is_parampack_empty)
160  {
161  auto combiner = [](ForallParam x, ForallParam y) {
163  return x;
164  };
165 
166  ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
168  auto reduction = ::sycl::reduction(res, f_params, combiner);
169 
170  q->submit([&](::sycl::handler& h) {
171  h.parallel_for(::sycl::range<1>(len), reduction,
172  [=](::sycl::item<1> it, auto& red) {
173  ForallParam fp;
175  IndexType ii = it.get_id(0);
176  if (ii < len)
177  {
178  if constexpr (is_lbody_trivially_copyable)
179  {
180  RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
181  }
182  else
183  {
184  RAJA::expt::invoke_body(fp, *lbody, (*d_begin)[ii]);
185  }
186  }
187  red.combine(fp);
188  });
189  });
190 
191  q->wait();
193  ::sycl::free(res, *q);
195  }
196  // Note: separate branches
197  else
198  {
199  q->submit([&](::sycl::handler& h) {
200  h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
201  [=](::sycl::nd_item<1> it) {
202  IndexType ii = it.get_global_id(0);
203  if (ii < len)
204  {
205  if constexpr (is_lbody_trivially_copyable)
206  {
207  loop_body(begin[ii]);
208  }
209  else
210  {
211  (*lbody)((*d_begin)[ii]);
212  }
213  }
214  });
215  });
216 
217  if (!Async)
218  {
219  q->wait();
220  }
221  }
222 
223 
224  // If we had to allocate device memory, free it
225  if constexpr (!is_lbody_trivially_copyable)
226  {
227  ::sycl::free(lbody, *q);
228  ::sycl::free(d_begin, *q);
229  }
230 
231 
232  return resources::EventProxy<resources::Sycl>(sycl_res);
233 }
234 
235 //
237 //
238 // The following function templates iterate over index set segments
239 // using the explicitly named segment iteration policy and execute
240 // segments as SYCL kernels.
241 //
243 //
244 
253 template<typename LoopBody,
254  size_t BlockSize,
255  bool Async,
256  typename... SegmentTypes>
257 RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(
258  resources::Sycl& r,
259  ExecPolicy<seq_segit, sycl_exec<BlockSize, Async>>,
260  const TypedIndexSet<SegmentTypes...>& iset,
261  LoopBody&& loop_body)
262 {
263  int num_seg = iset.getNumSegments();
264  for (int isi = 0; isi < num_seg; ++isi)
265  {
266  iset.segmentCall(r, isi, detail::CallForall(), sycl_exec<BlockSize, true>(),
267  loop_body);
268  } // iterate over segments of index set
269 
270  if (!Async)
271  {
272  ::sycl::queue* q = r.get_queue();
273  q->wait();
274  }
275 
276  return resources::EventProxy<resources::Sycl>(r);
277 }
278 
279 } // namespace sycl
280 
281 } // namespace policy
282 
283 } // namespace RAJA
284 
285 #endif // closing endif for RAJA_ENABLE_SYCL guard
286 
287 #endif // closing endif for header file include guard
RAJA header file defining index set classes.
Header file defining prototypes for routines used to manage memory for SYCL reductions and other oper...
Header file for common RAJA internal macro definitions.
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
constexpr RAJA_HOST_DEVICE auto invoke_body(Params &&params, Fn &&f, Ts &&... extra)
Definition: forall.hpp:598
seq_exec seq_segit
Definition: policy.hpp:83
RAJA_INLINE concepts::enable_if_t< resources::EventProxy< resources::Host >, expt::type_traits::is_ForallParamPack< ForallParam >, expt::type_traits::is_ForallParamPack_empty< ForallParam > > forall_impl(resources::Host host_res, const simd_exec &, Iterable &&iter, Func &&body, ForallParam)
Definition: forall.hpp:81
Definition: AlignedRangeIndexSetBuilders.cpp:35
Header file containing RAJA index set and segment iteration template methods that take an execution p...
Header file for RAJA resource definitions.
static constexpr void parampack_resolve(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:304
static constexpr void parampack_init(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:269
static RAJA_HOST_DEVICE constexpr void parampack_combine(EXEC_POL const &pol, ForallParamPack< Params... > &f_params, Args &&... args)
Definition: forall.hpp:286
Definition: TypeTraits.hpp:59
Header file containing RAJA SYCL policy definitions.
RAJA header file for handling different SYCL header include paths.
Header file for RAJA type definitions.