RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
WorkRunner.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_hip_WorkGroup_WorkRunner_HPP
21 #define RAJA_hip_WorkGroup_WorkRunner_HPP
22 
23 #include "RAJA/config.hpp"
24 
27 
29 
30 namespace RAJA
31 {
32 
33 namespace detail
34 {
35 
40 template<size_t BLOCK_SIZE,
41  bool Async,
42  typename DISPATCH_POLICY_T,
43  typename ALLOCATOR_T,
44  typename INDEX_T,
45  typename... Args>
46 struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
48  DISPATCH_POLICY_T,
49  ALLOCATOR_T,
50  INDEX_T,
51  Args...>
52  : WorkRunnerForallOrdered<RAJA::hip_exec_async<BLOCK_SIZE>,
53  RAJA::hip_work<BLOCK_SIZE, Async>,
54  RAJA::ordered,
55  DISPATCH_POLICY_T,
56  ALLOCATOR_T,
57  INDEX_T,
58  Args...>
59 {
61  RAJA::hip_work<BLOCK_SIZE, Async>,
63  DISPATCH_POLICY_T,
64  ALLOCATOR_T,
65  INDEX_T,
66  Args...>;
67  using base::base;
68  using IndexType = INDEX_T;
70 
75  template<typename WorkContainer>
76  per_run_storage run(WorkContainer const& storage,
77  typename base::resource_type r,
78  Args... args) const
79  {
80  per_run_storage run_storage =
81  base::run(storage, r, std::forward<Args>(args)...);
82 
83  IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
84 
85  // Only synchronize if we had something to iterate over
86  if (num_loops > 0 && BLOCK_SIZE > 0)
87  {
88  if (!Async)
89  {
91  }
92  }
93 
94  return run_storage;
95  }
96 };
97 
102 template<size_t BLOCK_SIZE,
103  bool Async,
104  typename DISPATCH_POLICY_T,
105  typename ALLOCATOR_T,
106  typename INDEX_T,
107  typename... Args>
108 struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
110  DISPATCH_POLICY_T,
111  ALLOCATOR_T,
112  INDEX_T,
113  Args...>
114  : WorkRunnerForallReverse<RAJA::hip_exec_async<BLOCK_SIZE>,
115  RAJA::hip_work<BLOCK_SIZE, Async>,
116  RAJA::reverse_ordered,
117  DISPATCH_POLICY_T,
118  ALLOCATOR_T,
119  INDEX_T,
120  Args...>
121 {
123  RAJA::hip_work<BLOCK_SIZE, Async>,
125  DISPATCH_POLICY_T,
126  ALLOCATOR_T,
127  INDEX_T,
128  Args...>;
129  using base::base;
130  using IndexType = INDEX_T;
132 
137  template<typename WorkContainer>
138  per_run_storage run(WorkContainer const& storage,
139  typename base::resource_type r,
140  Args... args) const
141  {
142  per_run_storage run_storage =
143  base::run(storage, r, std::forward<Args>(args)...);
144 
145  IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
146 
147  // Only synchronize if we had something to iterate over
148  if (num_loops > 0 && BLOCK_SIZE > 0)
149  {
150  if (!Async)
151  {
153  }
154  }
155 
156  return run_storage;
157  }
158 };
159 
164 template<typename Segment_type,
165  typename LoopBody,
166  typename index_type,
167  typename... Args>
169 {
170  template<typename segment_in, typename body_in>
171  HoldHipDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
172  : m_segment(std::forward<segment_in>(segment)),
173  m_body(std::forward<body_in>(body))
174  {}
175 
176  RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
177  {
178  // TODO:: decide when to run hooks, may bypass this and use impl directly
179  // TODO:: decide whether or not to privatize the loop body
180  const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
181  const index_type stride = blockDim.x * gridDim.x;
182  const auto begin = m_segment.begin();
183  const auto end = m_segment.end();
184  const index_type len(end - begin);
185  for (index_type i = i_begin; i < len; i += stride)
186  {
187  m_body(begin[i], std::forward<Args>(args)...);
188  }
189  }
190 
191 private:
192  Segment_type m_segment;
193  LoopBody m_body;
194 };
195 
196 template<size_t BLOCK_SIZE,
197  typename StorageIter,
198  typename value_type,
199  typename index_type,
200  typename... Args>
201 __launch_bounds__(BLOCK_SIZE, 1) __global__
202  void hip_unordered_y_block_global(const StorageIter iter, Args... args)
203 {
204  const index_type i_loop = blockIdx.y;
205  // TODO: cache pointer to value_type in shared memory
206  // TODO: cache holder (value_type::obj) in shared memory
207  value_type::device_call(&iter[i_loop], args...);
208 }
209 
216 template<size_t BLOCK_SIZE,
217  bool Async,
218  typename DISPATCH_POLICY_T,
219  typename ALLOCATOR_T,
220  typename INDEX_T,
221  typename... Args>
222 struct WorkRunner<
223  RAJA::hip_work<BLOCK_SIZE, Async>,
224  RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
225  DISPATCH_POLICY_T,
226  ALLOCATOR_T,
227  INDEX_T,
228  Args...>
229 {
230  using exec_policy = RAJA::hip_work<BLOCK_SIZE, Async>;
231  using order_policy =
232  RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
233  using dispatch_policy = DISPATCH_POLICY_T;
234  using Allocator = ALLOCATOR_T;
235  using index_type = INDEX_T;
236  using resource_type = resources::Hip;
237 
238  // The type that will hold the segment and loop body in work storage
239  struct holder_type
240  {
241  template<typename T>
243  typename camp::at<T, camp::num<0>>::type, // ITERABLE
244  typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
245  index_type,
246  Args...>;
247  };
248 
250  template<typename T>
251  using holder_type_t = typename holder_type::template type<T>;
252 
253  // The policy indicating where the call function is invoked
254  // in this case the values are called on the device
256 
257  // The Dispatcher policy with holder_types used internally to handle the
258  // ranges and callables passed in by the user.
261 
262  using dispatcher_type = Dispatcher<Platform::hip,
264  RAJA::hip_work<BLOCK_SIZE, true>,
265  Args...>;
266 
267  WorkRunner() = default;
268 
269  WorkRunner(WorkRunner const&) = delete;
270  WorkRunner& operator=(WorkRunner const&) = delete;
271 
272  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
273  {
274  o.m_total_iterations = 0;
275  }
276 
278  {
279  m_total_iterations = o.m_total_iterations;
280 
281  o.m_total_iterations = 0;
282  return *this;
283  }
284 
285  // runner interfaces with storage to enqueue so the runner can get
286  // information from the segment and loop at enqueue time
287  template<typename WorkContainer, typename Iterable, typename LoopBody>
288  inline void enqueue(WorkContainer& storage,
289  Iterable&& iter,
290  LoopBody&& loop_body)
291  {
292  using Iterator = camp::decay<decltype(std::begin(iter))>;
293  using LOOP_BODY = camp::decay<LoopBody>;
294  using ITERABLE = camp::decay<Iterable>;
295  using IndexType =
296  camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
297 
299 
300  // using true_value_type = typename WorkContainer::template
301  // true_value_type<holder>;
302 
303  Iterator begin = std::begin(iter);
304  Iterator end = std::end(iter);
305  IndexType len = std::distance(begin, end);
306 
307  // Only launch kernel if we have something to iterate over
308  if (len > 0 && BLOCK_SIZE > 0)
309  {
310 
311  m_total_iterations += len;
312 
313  //
314  // TODO: Privatize the loop_body, using make_launch_body to setup
315  // reductions
316  //
317  // LOOP_BODY body = RAJA::hip::make_launch_body(func,
318  // gridSize, blockSize, shmem, stream,
319  // std::forward<LoopBody>(loop_body));
320 
321  storage.template emplace<holder>(
322  get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
323  std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
324  }
325  }
326 
327  // no extra storage required here
328  using per_run_storage = int;
329 
330  template<typename WorkContainer>
331  per_run_storage run(WorkContainer const& storage,
332  resource_type r,
333  Args... args) const
334  {
335  using Iterator = camp::decay<decltype(std::begin(storage))>;
336  using IndexType = camp::decay<decltype(std::distance(std::begin(storage),
337  std::end(storage)))>;
338  using value_type = typename WorkContainer::value_type;
339 
340  per_run_storage run_storage {};
341 
342  auto func = hip_unordered_y_block_global<BLOCK_SIZE, Iterator, value_type,
343  index_type, Args...>;
344 
345  //
346  // Compute the requested iteration space size
347  //
348  Iterator begin = std::begin(storage);
349  Iterator end = std::end(storage);
350  IndexType num_loops = std::distance(begin, end);
351 
352  // Only launch kernel if we have something to iterate over
353  if (num_loops > 0 && BLOCK_SIZE > 0)
354  {
355 
356  index_type average_iterations =
357  m_total_iterations / static_cast<index_type>(num_loops);
358 
359  //
360  // Compute the number of blocks
361  //
362  constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
363  hip_dim_t blockSize {static_cast<hip_dim_member_t>(block_size), 1, 1};
364  hip_dim_t gridSize {
365  static_cast<hip_dim_member_t>((average_iterations + block_size - 1) /
366  block_size),
367  static_cast<hip_dim_member_t>(num_loops), 1};
368 
369 
370  //
371  // Setup shared memory buffers
372  //
373  size_t shmem = 0;
374 
375  {
376  //
377  // Launch the kernel
378  //
379  void* func_args[] = {(void*)&begin, (void*)&args...};
380  RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args,
381  shmem, r, Async);
382  }
383  }
384 
385  return run_storage;
386  }
387 
388  // clear any state so ready to be destroyed or reused
389  void clear() { m_total_iterations = 0; }
390 
391 private:
392  index_type m_total_iterations = 0;
393 };
394 
395 #if !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
396 
398 template<size_t BLOCK_SIZE,
399  bool Async,
400  typename ALLOCATOR_T,
401  typename INDEX_T,
402  typename... Args>
403 struct WorkRunner<
404  RAJA::hip_work<BLOCK_SIZE, Async>,
405  RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
407  ALLOCATOR_T,
408  INDEX_T,
409  Args...>;
411 template<size_t BLOCK_SIZE,
412  bool Async,
413  typename ALLOCATOR_T,
414  typename INDEX_T,
415  typename... Args>
416 struct WorkRunner<
417  RAJA::hip_work<BLOCK_SIZE, Async>,
418  RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
419  RAJA::indirect_virtual_function_dispatch,
420  ALLOCATOR_T,
421  INDEX_T,
422  Args...>;
423 
424 #endif
425 
426 } // namespace detail
427 
428 } // namespace RAJA
429 
430 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for HIP reductions and other opera...
Header file containing RAJA HIP policy definitions.
#define RAJA_DEVICE
Definition: macros.hpp:66
__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ void cuda_unordered_y_block_global(const RAJA_CUDA_GRID_CONSTANT StorageIter iter
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
typename dispatcher_transform_types< dispatch_policy, holder_type >::type dispatcher_transform_types_t
Definition: Dispatcher.hpp:72
Definition: AlignedRangeIndexSetBuilders.cpp:35
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
auto & body
Definition: launch.hpp:177
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
Definition: ListSegment.hpp:416
Header file providing RAJA WorkStorage.
Definition: Dispatcher.hpp:85
Definition: WorkRunner.hpp:169
RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
Definition: WorkRunner.hpp:176
HoldHipDeviceXThreadblockLoop(segment_in &&segment, body_in &&body)
Definition: WorkRunner.hpp:171
Definition: WorkRunner.hpp:162
typename resources::get_resource< FORALL_EXEC_POLICY >::type resource_type
Definition: WorkRunner.hpp:169
int per_run_storage
Definition: WorkRunner.hpp:231
Definition: WorkRunner.hpp:252
Definition: WorkRunner.hpp:300
per_run_storage run(WorkContainer const &storage, typename base::resource_type r, Args... args) const
Definition: WorkRunner.hpp:76
per_run_storage run(WorkContainer const &storage, typename base::resource_type r, Args... args) const
Definition: WorkRunner.hpp:138
Definition: WorkRunner.hpp:149
Dispatch using function pointers to make indirect function calls.
Definition: WorkGroup.hpp:77
Definition: WorkGroup.hpp:45
Definition: WorkGroup.hpp:53