RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
scan.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_scan_cuda_HPP
21 #define RAJA_scan_cuda_HPP
22 
23 #include "RAJA/config.hpp"
24 
25 #if defined(RAJA_ENABLE_CUDA)
26 
27 #include <iterator>
28 #include <type_traits>
29 
30 #include "cub/device/device_scan.cuh"
31 #include "cub/util_allocator.cuh"
32 
35 
36 namespace RAJA
37 {
38 namespace impl
39 {
40 namespace scan
41 {
42 
47 template<typename IterationMapping,
48  typename IterationGetter,
49  typename Concretizer,
50  size_t BLOCKS_PER_SM,
51  bool Async,
52  typename InputIter,
53  typename Function>
54 RAJA_INLINE resources::EventProxy<resources::Cuda> inclusive_inplace(
55  resources::Cuda cuda_res,
56  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
57  IterationGetter,
58  Concretizer,
59  BLOCKS_PER_SM,
60  Async>,
61  InputIter begin,
62  InputIter end,
63  Function binary_op)
64 {
65  cudaStream_t stream = cuda_res.get_stream();
66 
67  int len = std::distance(begin, end);
68  // Determine temporary device storage requirements
69  void* d_temp_storage = nullptr;
70  size_t temp_storage_bytes = 0;
71  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::InclusiveScan,
72  d_temp_storage, temp_storage_bytes, begin,
73  begin, binary_op, len, stream);
74  // Allocate temporary storage
75  d_temp_storage =
76  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
77  temp_storage_bytes);
78  // Run
79  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::InclusiveScan,
80  d_temp_storage, temp_storage_bytes, begin,
81  begin, binary_op, len, stream);
82  // Free temporary storage
83  cuda::device_mempool_type::getInstance().free(d_temp_storage);
84 
85  cuda::launch(cuda_res, Async);
86 
87  return resources::EventProxy<resources::Cuda>(cuda_res);
88 }
89 
94 template<typename IterationMapping,
95  typename IterationGetter,
96  typename Concretizer,
97  size_t BLOCKS_PER_SM,
98  bool Async,
99  typename InputIter,
100  typename Function,
101  typename T>
102 RAJA_INLINE resources::EventProxy<resources::Cuda> exclusive_inplace(
103  resources::Cuda cuda_res,
104  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
105  IterationGetter,
106  Concretizer,
107  BLOCKS_PER_SM,
108  Async>,
109  InputIter begin,
110  InputIter end,
111  Function binary_op,
112  T init)
113 {
114  cudaStream_t stream = cuda_res.get_stream();
115 
116  int len = std::distance(begin, end);
117  // Determine temporary device storage requirements
118  void* d_temp_storage = nullptr;
119  size_t temp_storage_bytes = 0;
120  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::ExclusiveScan,
121  d_temp_storage, temp_storage_bytes, begin,
122  begin, binary_op, init, len, stream);
123  // Allocate temporary storage
124  d_temp_storage =
125  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
126  temp_storage_bytes);
127  // Run
128  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::ExclusiveScan,
129  d_temp_storage, temp_storage_bytes, begin,
130  begin, binary_op, init, len, stream);
131  // Free temporary storage
132  cuda::device_mempool_type::getInstance().free(d_temp_storage);
133 
134  cuda::launch(cuda_res, Async);
135 
136  return resources::EventProxy<resources::Cuda>(cuda_res);
137 }
138 
143 template<typename IterationMapping,
144  typename IterationGetter,
145  typename Concretizer,
146  size_t BLOCKS_PER_SM,
147  bool Async,
148  typename InputIter,
149  typename OutputIter,
150  typename Function>
151 RAJA_INLINE resources::EventProxy<resources::Cuda> inclusive(
152  resources::Cuda cuda_res,
153  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
154  IterationGetter,
155  Concretizer,
156  BLOCKS_PER_SM,
157  Async>,
158  InputIter begin,
159  InputIter end,
160  OutputIter out,
161  Function binary_op)
162 {
163  cudaStream_t stream = cuda_res.get_stream();
164 
165  int len = std::distance(begin, end);
166  // Determine temporary device storage requirements
167  void* d_temp_storage = nullptr;
168  size_t temp_storage_bytes = 0;
169  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::InclusiveScan,
170  d_temp_storage, temp_storage_bytes, begin, out,
171  binary_op, len, stream);
172  // Allocate temporary storage
173  d_temp_storage =
174  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
175  temp_storage_bytes);
176  // Run
177  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::InclusiveScan,
178  d_temp_storage, temp_storage_bytes, begin, out,
179  binary_op, len, stream);
180  // Free temporary storage
181  cuda::device_mempool_type::getInstance().free(d_temp_storage);
182 
183  cuda::launch(cuda_res, Async);
184 
185  return resources::EventProxy<resources::Cuda>(cuda_res);
186 }
187 
192 template<typename IterationMapping,
193  typename IterationGetter,
194  typename Concretizer,
195  size_t BLOCKS_PER_SM,
196  bool Async,
197  typename InputIter,
198  typename OutputIter,
199  typename Function,
200  typename T>
201 RAJA_INLINE resources::EventProxy<resources::Cuda> exclusive(
202  resources::Cuda cuda_res,
203  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
204  IterationGetter,
205  Concretizer,
206  BLOCKS_PER_SM,
207  Async>,
208  InputIter begin,
209  InputIter end,
210  OutputIter out,
211  Function binary_op,
212  T init)
213 {
214  cudaStream_t stream = cuda_res.get_stream();
215 
216  int len = std::distance(begin, end);
217  // Determine temporary device storage requirements
218  void* d_temp_storage = nullptr;
219  size_t temp_storage_bytes = 0;
220  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::ExclusiveScan,
221  d_temp_storage, temp_storage_bytes, begin, out,
222  binary_op, init, len, stream);
223  // Allocate temporary storage
224  d_temp_storage =
225  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
226  temp_storage_bytes);
227  // Run
228  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceScan::ExclusiveScan,
229  d_temp_storage, temp_storage_bytes, begin, out,
230  binary_op, init, len, stream);
231  // Free temporary storage
232  cuda::device_mempool_type::getInstance().free(d_temp_storage);
233 
234  cuda::launch(cuda_res, Async);
235 
236  return resources::EventProxy<resources::Cuda>(cuda_res);
237 }
238 
239 } // namespace scan
240 
241 } // namespace impl
242 
243 } // namespace RAJA
244 
245 #endif // closing endif for RAJA_ENABLE_CUDA guard
246 
247 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file containing RAJA CUDA policy definitions.
RAJA_INLINE concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< Policy > > inclusive(resources::Host host_res, const Policy &exec, Iter begin, Iter end, OutIter out, BinFn f)
Definition: scan.hpp:144
RAJA_INLINE concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< Policy > > exclusive(resources::Host host_res, const Policy &exec, Iter begin, Iter end, OutIter out, BinFn f, ValueT v)
Definition: scan.hpp:167
RAJA_INLINE concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< Policy > > inclusive_inplace(resources::Host host_res, const Policy &, Iter begin, Iter end, BinFn f)
Definition: scan.hpp:51
RAJA_INLINE concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< Policy > > exclusive_inplace(resources::Host host_res, const Policy &, Iter begin, Iter end, BinFn f, ValueT v)
Definition: scan.hpp:96
Definition: AlignedRangeIndexSetBuilders.cpp:35
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268