20 #ifndef RAJA_cuda_WorkGroup_WorkRunner_HPP
21 #define RAJA_cuda_WorkGroup_WorkRunner_HPP
23 #include "RAJA/config.hpp"
40 template<
size_t BLOCK_SIZE,
43 typename DISPATCH_POLICY_T,
47 struct WorkRunner<
RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
54 RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
55 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
63 RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
64 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
78 template<
typename WorkContainer>
84 base::run(storage, r, std::forward<Args>(
args)...);
86 IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
89 if (num_loops > 0 && BLOCK_SIZE > 0)
105 template<
size_t BLOCK_SIZE,
106 size_t BLOCKS_PER_SM,
108 typename DISPATCH_POLICY_T,
109 typename ALLOCATOR_T,
119 RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
120 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
121 RAJA::reverse_ordered,
128 RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
129 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
143 template<
typename WorkContainer>
149 base::run(storage, r, std::forward<Args>(
args)...);
151 IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
154 if (num_loops > 0 && BLOCK_SIZE > 0)
170 template<
typename Segment_type,
176 template<
typename segment_in,
typename body_in>
178 : m_segment(
std::forward<segment_in>(segment)),
179 m_body(
std::forward<body_in>(
body))
186 const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
187 const index_type stride = blockDim.x * gridDim.x;
188 const auto begin = m_segment.begin();
189 const auto end = m_segment.end();
190 const index_type len(end - begin);
191 for (index_type i = i_begin; i < len; i += stride)
193 m_body(begin[i], std::forward<Args>(
args)...);
198 Segment_type m_segment;
202 template<
size_t BLOCK_SIZE,
203 size_t BLOCKS_PER_SM,
204 typename StorageIter,
209 void cuda_unordered_y_block_global(
const RAJA_CUDA_GRID_CONSTANT StorageIter
213 const index_type i_loop = blockIdx.y;
225 template<
size_t BLOCK_SIZE,
226 size_t BLOCKS_PER_SM,
228 typename DISPATCH_POLICY_T,
229 typename ALLOCATOR_T,
233 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
234 RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
241 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
243 unordered_cuda_loop_y_block_iter_x_threadblock_average;
254 typename camp::at<T, camp::num<0>>
::type,
255 typename camp::at<T, camp::num<1>>
::type,
276 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>,
286 o.m_total_iterations = 0;
291 m_total_iterations = o.m_total_iterations;
293 o.m_total_iterations = 0;
299 template<
typename WorkContainer,
typename Iterable,
typename LoopBody>
302 LoopBody&& loop_body)
304 using Iterator = camp::decay<decltype(std::begin(
iter))>;
305 using LOOP_BODY = camp::decay<LoopBody>;
306 using ITERABLE = camp::decay<Iterable>;
308 camp::decay<decltype(std::distance(std::begin(
iter), std::end(
iter)))>;
315 Iterator begin = std::begin(
iter);
316 Iterator end = std::end(
iter);
317 IndexType len = std::distance(begin, end);
320 if (len > 0 && BLOCK_SIZE > 0)
323 m_total_iterations += len;
333 storage.template emplace<holder>(
335 std::forward<Iterable>(
iter), std::forward<LoopBody>(loop_body));
342 template<
typename WorkContainer>
347 using Iterator = camp::decay<decltype(std::begin(storage))>;
348 using IndexType = camp::decay<decltype(std::distance(std::begin(storage),
349 std::end(storage)))>;
350 using value_type =
typename WorkContainer::value_type;
355 cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator,
361 Iterator begin = std::begin(storage);
362 Iterator end = std::end(storage);
363 IndexType num_loops = std::distance(begin, end);
366 if (num_loops > 0 && BLOCK_SIZE > 0)
370 m_total_iterations /
static_cast<index_type>(num_loops);
376 cuda_dim_t blockSize {
static_cast<cuda_dim_member_t
>(block_size), 1, 1};
377 cuda_dim_t gridSize {
378 static_cast<cuda_dim_member_t
>((average_iterations + block_size - 1) /
380 static_cast<cuda_dim_member_t
>(num_loops), 1};
392 void* func_args[] = {(
void*)&begin, (
void*)&
args...};
402 void clear() { m_total_iterations = 0; }
405 index_type m_total_iterations = 0;
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file containing RAJA CUDA policy definitions.
#define RAJA_DEVICE
Definition: macros.hpp:66
__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ void cuda_unordered_y_block_global(const RAJA_CUDA_GRID_CONSTANT StorageIter iter
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
typename dispatcher_transform_types< dispatch_policy, holder_type >::type dispatcher_transform_types_t
Definition: Dispatcher.hpp:72
Definition: AlignedRangeIndexSetBuilders.cpp:35
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
auto & body
Definition: launch.hpp:177
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
Definition: ListSegment.hpp:416
Header file providing RAJA WorkStorage.
Definition: Dispatcher.hpp:85
Definition: WorkRunner.hpp:175
HoldCudaDeviceXThreadblockLoop(segment_in &&segment, body_in &&body)
Definition: WorkRunner.hpp:177
RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
Definition: WorkRunner.hpp:182
Definition: WorkRunner.hpp:162
typename resources::get_resource< FORALL_EXEC_POLICY >::type resource_type
Definition: WorkRunner.hpp:169
int per_run_storage
Definition: WorkRunner.hpp:231
Definition: WorkRunner.hpp:252
Definition: WorkRunner.hpp:300
resources::Cuda resource_type
Definition: WorkRunner.hpp:247
RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average order_policy
Definition: WorkRunner.hpp:243
WorkRunner & operator=(WorkRunner const &)=delete
typename holder_type::template type< T > holder_type_t
Definition: WorkRunner.hpp:262
DISPATCH_POLICY_T dispatch_policy
Definition: WorkRunner.hpp:244
INDEX_T index_type
Definition: WorkRunner.hpp:246
void enqueue(WorkContainer &storage, Iterable &&iter, LoopBody &&loop_body)
Definition: WorkRunner.hpp:300
WorkRunner(WorkRunner const &)=delete
WorkRunner(WorkRunner &&o)
Definition: WorkRunner.hpp:284
int per_run_storage
Definition: WorkRunner.hpp:340
WorkRunner & operator=(WorkRunner &&o)
Definition: WorkRunner.hpp:289
per_run_storage run(WorkContainer const &storage, resource_type r, Args... args) const
Definition: WorkRunner.hpp:343
exec_policy dispatcher_exec_policy
Definition: WorkRunner.hpp:266
ALLOCATOR_T Allocator
Definition: WorkRunner.hpp:245
RAJA::cuda_work_explicit< BLOCK_SIZE, BLOCKS_PER_SM, Async > exec_policy
Definition: WorkRunner.hpp:241
dispatcher_transform_types_t< dispatch_policy, holder_type > dispatcher_holder_policy
Definition: WorkRunner.hpp:271
void clear()
Definition: WorkRunner.hpp:402
per_run_storage run(WorkContainer const &storage, typename base::resource_type r, Args... args) const
Definition: WorkRunner.hpp:79
INDEX_T IndexType
Definition: WorkRunner.hpp:71
INDEX_T IndexType
Definition: WorkRunner.hpp:136
per_run_storage run(WorkContainer const &storage, typename base::resource_type r, Args... args) const
Definition: WorkRunner.hpp:144
Definition: WorkRunner.hpp:149
Definition: WorkGroup.hpp:45
Definition: WorkGroup.hpp:53