RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
WorkRunner.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_cuda_WorkGroup_WorkRunner_HPP
21 #define RAJA_cuda_WorkGroup_WorkRunner_HPP
22 
23 #include "RAJA/config.hpp"
24 
27 
29 
30 namespace RAJA
31 {
32 
33 namespace detail
34 {
35 
40 template<size_t BLOCK_SIZE,
41  size_t BLOCKS_PER_SM,
42  bool Async,
43  typename DISPATCH_POLICY_T,
44  typename ALLOCATOR_T,
45  typename INDEX_T,
46  typename... Args>
47 struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
49  DISPATCH_POLICY_T,
50  ALLOCATOR_T,
51  INDEX_T,
52  Args...>
54  RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
55  RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
56  RAJA::ordered,
57  DISPATCH_POLICY_T,
58  ALLOCATOR_T,
59  INDEX_T,
60  Args...>
61 {
63  RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
64  RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
66  DISPATCH_POLICY_T,
67  ALLOCATOR_T,
68  INDEX_T,
69  Args...>;
70  using base::base;
71  using IndexType = INDEX_T;
73 
78  template<typename WorkContainer>
79  per_run_storage run(WorkContainer const& storage,
80  typename base::resource_type r,
81  Args... args) const
82  {
83  per_run_storage run_storage =
84  base::run(storage, r, std::forward<Args>(args)...);
85 
86  IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
87 
88  // Only synchronize if we had something to iterate over
89  if (num_loops > 0 && BLOCK_SIZE > 0)
90  {
91  if (!Async)
92  {
94  }
95  }
96 
97  return run_storage;
98  }
99 };
100 
105 template<size_t BLOCK_SIZE,
106  size_t BLOCKS_PER_SM,
107  bool Async,
108  typename DISPATCH_POLICY_T,
109  typename ALLOCATOR_T,
110  typename INDEX_T,
111  typename... Args>
112 struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
114  DISPATCH_POLICY_T,
115  ALLOCATOR_T,
116  INDEX_T,
117  Args...>
119  RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
120  RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
121  RAJA::reverse_ordered,
122  DISPATCH_POLICY_T,
123  ALLOCATOR_T,
124  INDEX_T,
125  Args...>
126 {
128  RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
129  RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
131  DISPATCH_POLICY_T,
132  ALLOCATOR_T,
133  INDEX_T,
134  Args...>;
135  using base::base;
136  using IndexType = INDEX_T;
138 
143  template<typename WorkContainer>
144  per_run_storage run(WorkContainer const& storage,
145  typename base::resource_type r,
146  Args... args) const
147  {
148  per_run_storage run_storage =
149  base::run(storage, r, std::forward<Args>(args)...);
150 
151  IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
152 
153  // Only synchronize if we had something to iterate over
154  if (num_loops > 0 && BLOCK_SIZE > 0)
155  {
156  if (!Async)
157  {
159  }
160  }
161 
162  return run_storage;
163  }
164 };
165 
170 template<typename Segment_type,
171  typename LoopBody,
172  typename index_type,
173  typename... Args>
175 {
176  template<typename segment_in, typename body_in>
177  HoldCudaDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
178  : m_segment(std::forward<segment_in>(segment)),
179  m_body(std::forward<body_in>(body))
180  {}
181 
182  RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
183  {
184  // TODO:: decide when to run hooks, may bypass this and use impl directly
185  // TODO:: decide whether or not to privatize the loop body
186  const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
187  const index_type stride = blockDim.x * gridDim.x;
188  const auto begin = m_segment.begin();
189  const auto end = m_segment.end();
190  const index_type len(end - begin);
191  for (index_type i = i_begin; i < len; i += stride)
192  {
193  m_body(begin[i], std::forward<Args>(args)...);
194  }
195  }
196 
197 private:
198  Segment_type m_segment;
199  LoopBody m_body;
200 };
201 
202 template<size_t BLOCK_SIZE,
203  size_t BLOCKS_PER_SM,
204  typename StorageIter,
205  typename value_type,
206  typename index_type,
207  typename... Args>
208 __launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
209  void cuda_unordered_y_block_global(const RAJA_CUDA_GRID_CONSTANT StorageIter
210  iter,
211  Args... args)
212 {
213  const index_type i_loop = blockIdx.y;
214  // TODO: cache pointer to value_type in shared memory
215  // TODO: cache holder (value_type::obj) in shared memory
216  value_type::device_call(&iter[i_loop], args...);
217 }
218 
225 template<size_t BLOCK_SIZE,
226  size_t BLOCKS_PER_SM,
227  bool Async,
228  typename DISPATCH_POLICY_T,
229  typename ALLOCATOR_T,
230  typename INDEX_T,
231  typename... Args>
232 struct WorkRunner<
233  RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
234  RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
235  DISPATCH_POLICY_T,
236  ALLOCATOR_T,
237  INDEX_T,
238  Args...>
239 {
240  using exec_policy =
241  RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
242  using order_policy = RAJA::policy::cuda::
243  unordered_cuda_loop_y_block_iter_x_threadblock_average;
244  using dispatch_policy = DISPATCH_POLICY_T;
245  using Allocator = ALLOCATOR_T;
246  using index_type = INDEX_T;
247  using resource_type = resources::Cuda;
248 
249  // The type that will hold the segment and loop body in work storage
250  struct holder_type
251  {
252  template<typename T>
254  typename camp::at<T, camp::num<0>>::type, // ITERABLE
255  typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
256  index_type,
257  Args...>;
258  };
259 
261  template<typename T>
262  using holder_type_t = typename holder_type::template type<T>;
263 
264  // The policy indicating where the call function is invoked
265  // in this case the values are called on the device
267 
268  // The Dispatcher policy with holder_types used internally to handle the
269  // ranges and callables passed in by the user.
272 
274  Dispatcher<Platform::cuda,
276  RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>,
277  Args...>;
278 
279  WorkRunner() = default;
280 
281  WorkRunner(WorkRunner const&) = delete;
282  WorkRunner& operator=(WorkRunner const&) = delete;
283 
284  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
285  {
286  o.m_total_iterations = 0;
287  }
288 
290  {
291  m_total_iterations = o.m_total_iterations;
292 
293  o.m_total_iterations = 0;
294  return *this;
295  }
296 
297  // runner interfaces with storage to enqueue so the runner can get
298  // information from the segment and loop at enqueue time
299  template<typename WorkContainer, typename Iterable, typename LoopBody>
300  inline void enqueue(WorkContainer& storage,
301  Iterable&& iter,
302  LoopBody&& loop_body)
303  {
304  using Iterator = camp::decay<decltype(std::begin(iter))>;
305  using LOOP_BODY = camp::decay<LoopBody>;
306  using ITERABLE = camp::decay<Iterable>;
307  using IndexType =
308  camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
309 
311 
312  // using true_value_type = typename WorkContainer::template
313  // true_value_type<holder>;
314 
315  Iterator begin = std::begin(iter);
316  Iterator end = std::end(iter);
317  IndexType len = std::distance(begin, end);
318 
319  // Only launch kernel if we have something to iterate over
320  if (len > 0 && BLOCK_SIZE > 0)
321  {
322 
323  m_total_iterations += len;
324 
325  //
326  // TODO: Privatize the loop_body, using make_launch_body to setup
327  // reductions
328  //
329  // LOOP_BODY body = RAJA::cuda::make_launch_body(func,
330  // gridSize, blockSize, shmem, stream,
331  // std::forward<LoopBody>(loop_body));
332 
333  storage.template emplace<holder>(
334  get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
335  std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
336  }
337  }
338 
339  // no extra storage required here
340  using per_run_storage = int;
341 
342  template<typename WorkContainer>
343  per_run_storage run(WorkContainer const& storage,
344  resource_type r,
345  Args... args) const
346  {
347  using Iterator = camp::decay<decltype(std::begin(storage))>;
348  using IndexType = camp::decay<decltype(std::distance(std::begin(storage),
349  std::end(storage)))>;
350  using value_type = typename WorkContainer::value_type;
351 
352  per_run_storage run_storage {};
353 
354  auto func =
355  cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator,
356  value_type, index_type, Args...>;
357 
358  //
359  // Compute the requested iteration space size
360  //
361  Iterator begin = std::begin(storage);
362  Iterator end = std::end(storage);
363  IndexType num_loops = std::distance(begin, end);
364 
365  // Only launch kernel if we have something to iterate over
366  if (num_loops > 0 && BLOCK_SIZE > 0)
367  {
368 
369  index_type average_iterations =
370  m_total_iterations / static_cast<index_type>(num_loops);
371 
372  //
373  // Compute the number of blocks
374  //
375  constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
376  cuda_dim_t blockSize {static_cast<cuda_dim_member_t>(block_size), 1, 1};
377  cuda_dim_t gridSize {
378  static_cast<cuda_dim_member_t>((average_iterations + block_size - 1) /
379  block_size),
380  static_cast<cuda_dim_member_t>(num_loops), 1};
381 
382 
383  //
384  // Setup shared memory buffers
385  //
386  size_t shmem = 0;
387 
388  {
389  //
390  // Launch the kernel
391  //
392  void* func_args[] = {(void*)&begin, (void*)&args...};
393  RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args,
394  shmem, r, Async);
395  }
396  }
397 
398  return run_storage;
399  }
400 
401  // clear any state so ready to be destroyed or reused
402  void clear() { m_total_iterations = 0; }
403 
404 private:
405  index_type m_total_iterations = 0;
406 };
407 
408 
409 } // namespace detail
410 
411 } // namespace RAJA
412 
413 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file containing RAJA CUDA policy definitions.
#define RAJA_DEVICE
Definition: macros.hpp:66
__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ void cuda_unordered_y_block_global(const RAJA_CUDA_GRID_CONSTANT StorageIter iter
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
typename dispatcher_transform_types< dispatch_policy, holder_type >::type dispatcher_transform_types_t
Definition: Dispatcher.hpp:72
Definition: AlignedRangeIndexSetBuilders.cpp:35
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
auto & body
Definition: launch.hpp:177
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
Definition: ListSegment.hpp:416
Header file providing RAJA WorkStorage.
Definition: Dispatcher.hpp:85
Definition: WorkRunner.hpp:175
HoldCudaDeviceXThreadblockLoop(segment_in &&segment, body_in &&body)
Definition: WorkRunner.hpp:177
RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
Definition: WorkRunner.hpp:182
Definition: WorkRunner.hpp:162
typename resources::get_resource< FORALL_EXEC_POLICY >::type resource_type
Definition: WorkRunner.hpp:169
int per_run_storage
Definition: WorkRunner.hpp:231
Definition: WorkRunner.hpp:252
Definition: WorkRunner.hpp:300
per_run_storage run(WorkContainer const &storage, typename base::resource_type r, Args... args) const
Definition: WorkRunner.hpp:79
per_run_storage run(WorkContainer const &storage, typename base::resource_type r, Args... args) const
Definition: WorkRunner.hpp:144
Definition: WorkRunner.hpp:149
Definition: WorkGroup.hpp:45
Definition: WorkGroup.hpp:53