RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
Tile.hpp
Go to the documentation of this file.
1 
12 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
13 // Copyright (c) Lawrence Livermore National Security, LLC and other
14 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
15 // files for dates and other details. No copyright assignment is required
16 // to contribute to RAJA.
17 //
18 // SPDX-License-Identifier: (BSD-3-Clause)
19 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
20 
21 
22 #ifndef RAJA_policy_cuda_kernel_Tile_HPP
23 #define RAJA_policy_cuda_kernel_Tile_HPP
24 
25 #include "RAJA/config.hpp"
26 
27 #if defined(RAJA_ENABLE_CUDA)
28 
29 #include <iostream>
30 #include <type_traits>
31 
32 #include "camp/camp.hpp"
33 #include "camp/concepts.hpp"
34 #include "camp/tuple.hpp"
35 
36 #include "RAJA/util/macros.hpp"
37 #include "RAJA/util/types.hpp"
38 
41 
42 namespace RAJA
43 {
44 namespace internal
45 {
46 
52 template<typename Data,
53  camp::idx_t ArgumentId,
54  camp::idx_t chunk_size,
55  typename IndexMapper,
57  typename... EnclosedStmts,
58  typename Types>
59 struct CudaStatementExecutor<
60  Data,
61  statement::Tile<
62  ArgumentId,
63  RAJA::tile_fixed<chunk_size>,
64  RAJA::policy::cuda::
65  cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
66  EnclosedStmts...>,
67  Types>
68 {
69 
70  using stmt_list_t = StatementList<EnclosedStmts...>;
71 
72  using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
73 
74  using diff_t = segment_diff_type<ArgumentId, Data>;
75 
76  using DimensionCalculator = KernelDimensionCalculator<
77  RAJA::policy::cuda::
78  cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
79 
80  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
81  {
82  // Get the segment referenced by this Tile statement
83  auto& segment = camp::get<ArgumentId>(data.segment_tuple);
84 
85  using segment_t = camp::decay<decltype(segment)>;
86 
87  // compute trip count
88  const diff_t i =
89  IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
90 
91  // Keep copy of original segment, so we can restore it
92  segment_t orig_segment = segment;
93 
94  // Assign our new tiled segment
95  segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
96 
97  // execute enclosed statements
98  enclosed_stmts_t::exec(data, thread_active);
99 
100  // Set range back to original values
101  segment = orig_segment;
102  }
103 
104  static inline LaunchDims calculateDimensions(Data const& data)
105  {
106  // Compute how many chunks
107  const diff_t full_len = segment_length<ArgumentId>(data);
108  const diff_t len =
109  RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
110 
111  LaunchDims dims = DimensionCalculator::get_dimensions(len);
112 
113  // privatize data, so we can mess with the segments
114  using data_t = camp::decay<Data>;
115  data_t private_data = data;
116 
117  // Get original segment
118  auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
119 
120  // restrict to first tile
121  segment = segment.slice(0, static_cast<diff_t>(chunk_size));
122 
123  // NOTE: We do not detect improper uses of direct_unchecked policies under
124  // tiling. This happens when using a direct unchecked policy on a tiled
125  // range that is not evenly divisible by chunk_size.
126  LaunchDims enclosed_dims =
127  enclosed_stmts_t::calculateDimensions(private_data);
128 
129  return combine(dims, enclosed_dims);
130  }
131 };
132 
138 template<typename Data,
139  camp::idx_t ArgumentId,
140  camp::idx_t chunk_size,
141  typename IndexMapper,
143  typename... EnclosedStmts,
144  typename Types>
145 struct CudaStatementExecutor<
146  Data,
147  statement::Tile<ArgumentId,
148  RAJA::tile_fixed<chunk_size>,
149  RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
150  sync,
151  IndexMapper>,
152  EnclosedStmts...>,
153  Types>
154 {
155 
156  using stmt_list_t = StatementList<EnclosedStmts...>;
157 
158  using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
159 
160  using diff_t = segment_diff_type<ArgumentId, Data>;
161 
162  using DimensionCalculator = KernelDimensionCalculator<
163  RAJA::policy::cuda::
164  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
165 
166  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
167  {
168  // Get the segment referenced by this Tile statement
169  auto& segment = camp::get<ArgumentId>(data.segment_tuple);
170 
171  using segment_t = camp::decay<decltype(segment)>;
172 
173  // compute trip count
174  const diff_t len = segment.end() - segment.begin();
175  const diff_t i =
176  IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
177 
178  // execute enclosed statements if any thread will
179  // but mask off threads without work
180  const bool have_work = (i < len);
181 
182  // Keep copy of original segment, so we can restore it
183  segment_t orig_segment = segment;
184 
185  // Assign our new tiled segment
186  segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
187 
188  // execute enclosed statements
189  enclosed_stmts_t::exec(data, thread_active && have_work);
190 
191  // Set range back to original values
192  segment = orig_segment;
193  }
194 
195  static inline LaunchDims calculateDimensions(Data const& data)
196  {
197  // Compute how many chunks
198  const diff_t full_len = segment_length<ArgumentId>(data);
199  const diff_t len =
200  RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
201 
202  LaunchDims dims = DimensionCalculator::get_dimensions(len);
203 
204  // privatize data, so we can mess with the segments
205  using data_t = camp::decay<Data>;
206  data_t private_data = data;
207 
208  // Get original segment
209  auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
210 
211  // restrict to first tile
212  segment = segment.slice(0, static_cast<diff_t>(chunk_size));
213 
214  LaunchDims enclosed_dims =
215  enclosed_stmts_t::calculateDimensions(private_data);
216 
217  return combine(dims, enclosed_dims);
218  }
219 };
220 
226 template<typename Data,
227  camp::idx_t ArgumentId,
228  camp::idx_t chunk_size,
229  typename IndexMapper,
230  typename... EnclosedStmts,
231  typename Types>
232 struct CudaStatementExecutor<
233  Data,
234  statement::Tile<
235  ArgumentId,
236  RAJA::tile_fixed<chunk_size>,
237  RAJA::policy::cuda::cuda_indexer<
238  iteration_mapping::StridedLoop<named_usage::unspecified>,
239  kernel_sync_requirement::sync,
240  IndexMapper>,
241  EnclosedStmts...>,
242  Types>
243 {
244 
245  using stmt_list_t = StatementList<EnclosedStmts...>;
246 
247  using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
248 
249  using diff_t = segment_diff_type<ArgumentId, Data>;
250 
251  using DimensionCalculator =
252  KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
253  iteration_mapping::StridedLoop<named_usage::unspecified>,
255  IndexMapper>>;
256 
257  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
258  {
259  // Get the segment referenced by this Tile statement
260  auto& segment = camp::get<ArgumentId>(data.segment_tuple);
261 
262  // Keep copy of original segment, so we can restore it
263  using segment_t = camp::decay<decltype(segment)>;
264  segment_t orig_segment = segment;
265 
266  // compute trip count
267  const diff_t len = segment.end() - segment.begin();
268  const diff_t i_init =
269  IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
270  const diff_t i_stride =
271  IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
272 
273  // Iterate through in chunks
274  // threads will have the same numbers of iterations
275  for (diff_t ii = 0; ii < len; ii += i_stride)
276  {
277  const diff_t i = ii + i_init;
278 
279  // execute enclosed statements if any thread will
280  // but mask off threads without work
281  const bool have_work = (i < len);
282 
283  // Assign our new tiled segment
284  segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
285 
286  // execute enclosed statements
287  enclosed_stmts_t::exec(data, thread_active && have_work);
288  }
289 
290  // Set range back to original values
291  segment = orig_segment;
292  }
293 
294  static inline LaunchDims calculateDimensions(Data const& data)
295  {
296  // Compute how many chunks
297  const diff_t full_len = segment_length<ArgumentId>(data);
298  const diff_t len =
299  RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
300 
301  LaunchDims dims = DimensionCalculator::get_dimensions(len);
302 
303  // privatize data, so we can mess with the segments
304  using data_t = camp::decay<Data>;
305  data_t private_data = data;
306 
307  // Get original segment
308  auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
309 
310  // restrict to first tile
311  segment = segment.slice(0, chunk_size);
312 
313  LaunchDims enclosed_dims =
314  enclosed_stmts_t::calculateDimensions(private_data);
315 
316  return combine(dims, enclosed_dims);
317  }
318 };
319 
325 template<typename Data,
326  camp::idx_t ArgumentId,
327  camp::idx_t chunk_size,
328  typename IndexMapper,
329  typename... EnclosedStmts,
330  typename Types>
331 struct CudaStatementExecutor<
332  Data,
333  statement::Tile<
334  ArgumentId,
335  RAJA::tile_fixed<chunk_size>,
336  RAJA::policy::cuda::cuda_indexer<
337  iteration_mapping::StridedLoop<named_usage::unspecified>,
338  kernel_sync_requirement::none,
339  IndexMapper>,
340  EnclosedStmts...>,
341  Types>
342 {
343 
344  using stmt_list_t = StatementList<EnclosedStmts...>;
345 
346  using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
347 
348  using diff_t = segment_diff_type<ArgumentId, Data>;
349 
350  using DimensionCalculator =
351  KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
352  iteration_mapping::StridedLoop<named_usage::unspecified>,
354  IndexMapper>>;
355 
356  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
357  {
358  // Get the segment referenced by this Tile statement
359  auto& segment = camp::get<ArgumentId>(data.segment_tuple);
360 
361  // Keep copy of original segment, so we can restore it
362  using segment_t = camp::decay<decltype(segment)>;
363  segment_t orig_segment = segment;
364 
365  // compute trip count
366  const diff_t len = segment.end() - segment.begin();
367  const diff_t i_init =
368  IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
369  const diff_t i_stride =
370  IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
371 
372  // Iterate through one at a time
373  // threads will have the different numbers of iterations
374  for (diff_t i = i_init; i < len; i += i_stride)
375  {
376 
377  // Assign our new tiled segment
378  segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
379 
380  // execute enclosed statements
381  enclosed_stmts_t::exec(data, thread_active);
382  }
383 
384  // Set range back to original values
385  segment = orig_segment;
386  }
387 
388  static inline LaunchDims calculateDimensions(Data const& data)
389  {
390  // Compute how many chunks
391  const diff_t full_len = segment_length<ArgumentId>(data);
392  const diff_t len =
393  RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
394 
395  LaunchDims dims = DimensionCalculator::get_dimensions(len);
396 
397  // privatize data, so we can mess with the segments
398  using data_t = camp::decay<Data>;
399  data_t private_data = data;
400 
401  // Get original segment
402  auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
403 
404  // restrict to first tile
405  segment = segment.slice(0, chunk_size);
406 
407  LaunchDims enclosed_dims =
408  enclosed_stmts_t::calculateDimensions(private_data);
409 
410  return combine(dims, enclosed_dims);
411  }
412 };
413 
/*!
 * CudaStatementExecutor specialization for statement::Tile whose execution
 * policy is seq_exec.
 *
 * The struct body is empty: everything is inherited from the executor for
 * the same Tile statement (same ArgumentId, TPol and enclosed statements)
 * with seq_exec replaced by a cuda_indexer built from
 * iteration_mapping::StridedLoop<named_usage::unspecified>,
 * kernel_sync_requirement::none, and
 * cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>.
 *
 * NOTE(review): TPol is forwarded as-is; the strided-loop specialization
 * visible in this file matches RAJA::tile_fixed<chunk_size> only -- confirm
 * how other tile policies are expected to resolve.
 */
419 template<typename Data,
420  camp::idx_t ArgumentId,
421  typename TPol,
422  typename... EnclosedStmts,
423  typename Types>
424 struct CudaStatementExecutor<
425  Data,
426  statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
427  Types>
428  : CudaStatementExecutor<
429  Data,
430  statement::Tile<
431  ArgumentId,
432  TPol,
433  RAJA::policy::cuda::cuda_indexer<
434  iteration_mapping::StridedLoop<named_usage::unspecified>,
435  kernel_sync_requirement::none,
436  cuda::IndexGlobal<named_dim::x,
437  named_usage::ignored,
438  named_usage::ignored>>,
439  EnclosedStmts...>,
440  Types>
441 {};
442 
443 } // end namespace internal
444 } // end namespace RAJA
445 
446 #endif // RAJA_ENABLE_CUDA
447 #endif /* RAJA_policy_cuda_kernel_Tile_HPP */
Header file for common RAJA internal macro definitions.
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
kernel_sync_requirement
Definition: types.hpp:63
Header file for tile wrapper and iterator.
Header file for loop kernel internals.
Header file for RAJA type definitions.