RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
internal.hpp
Go to the documentation of this file.
1 
12 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
13 // Copyright (c) Lawrence Livermore National Security, LLC and other
14 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
15 // files for dates and other details. No copyright assignment is required
16 // to contribute to RAJA.
17 //
18 // SPDX-License-Identifier: (BSD-3-Clause)
19 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
20 
21 
22 #ifndef RAJA_policy_cuda_kernel_internal_HPP
23 #define RAJA_policy_cuda_kernel_internal_HPP
24 
25 #include "RAJA/config.hpp"
26 
27 #if defined(RAJA_ENABLE_CUDA)
28 
29 #include <cassert>
30 #include <climits>
31 
32 #include "camp/camp.hpp"
33 
34 #include "RAJA/pattern/kernel.hpp"
35 
36 #include "RAJA/util/macros.hpp"
37 #include "RAJA/util/types.hpp"
38 
41 
42 namespace RAJA
43 {
44 
45 namespace internal
46 {
47 
48 struct LaunchDims
49 {
50 
51  CudaDims active {0};
52  CudaDims dims {0};
53  CudaDims min_dims {0};
54 
55  LaunchDims() = default;
56  LaunchDims(LaunchDims const&) = default;
57  LaunchDims(LaunchDims&&) = default;
58  LaunchDims& operator=(LaunchDims const&) = default;
59  LaunchDims& operator=(LaunchDims&&) = default;
60 
61  RAJA_INLINE
62  LaunchDims(CudaDims _active, CudaDims _dims, CudaDims _min_dims)
63  : active {_active},
64  dims {_dims},
65  min_dims {_min_dims}
66  {}
67 
68  RAJA_INLINE
69  LaunchDims max(LaunchDims const& c) const
70  {
71  LaunchDims result;
72 
73  result.active.blocks.x = std::max(c.active.blocks.x, active.blocks.x);
74  result.active.blocks.y = std::max(c.active.blocks.y, active.blocks.y);
75  result.active.blocks.z = std::max(c.active.blocks.z, active.blocks.z);
76 
77  result.dims.blocks.x = std::max(c.dims.blocks.x, dims.blocks.x);
78  result.dims.blocks.y = std::max(c.dims.blocks.y, dims.blocks.y);
79  result.dims.blocks.z = std::max(c.dims.blocks.z, dims.blocks.z);
80 
81  result.min_dims.blocks.x = std::max(c.min_dims.blocks.x, min_dims.blocks.x);
82  result.min_dims.blocks.y = std::max(c.min_dims.blocks.y, min_dims.blocks.y);
83  result.min_dims.blocks.z = std::max(c.min_dims.blocks.z, min_dims.blocks.z);
84 
85  result.active.threads.x = std::max(c.active.threads.x, active.threads.x);
86  result.active.threads.y = std::max(c.active.threads.y, active.threads.y);
87  result.active.threads.z = std::max(c.active.threads.z, active.threads.z);
88 
89  result.dims.threads.x = std::max(c.dims.threads.x, dims.threads.x);
90  result.dims.threads.y = std::max(c.dims.threads.y, dims.threads.y);
91  result.dims.threads.z = std::max(c.dims.threads.z, dims.threads.z);
92 
93  result.min_dims.threads.x =
94  std::max(c.min_dims.threads.x, min_dims.threads.x);
95  result.min_dims.threads.y =
96  std::max(c.min_dims.threads.y, min_dims.threads.y);
97  result.min_dims.threads.z =
98  std::max(c.min_dims.threads.z, min_dims.threads.z);
99 
100  return result;
101  }
102 
103  RAJA_INLINE
104  int blocks_are_active() const
105  {
106  return active.blocks.x || active.blocks.y || active.blocks.z;
107  }
108 
109  RAJA_INLINE
110  int threads_are_active() const
111  {
112  return active.threads.x || active.threads.y || active.threads.z;
113  }
114 
115  RAJA_INLINE
116  int num_blocks() const
117  {
118  if (blocks_are_active())
119  {
120  return (active.blocks.x ? dims.blocks.x : 1) *
121  (active.blocks.y ? dims.blocks.y : 1) *
122  (active.blocks.z ? dims.blocks.z : 1);
123  }
124  else
125  {
126  return 0;
127  }
128  }
129 
130  RAJA_INLINE
131  int num_threads() const
132  {
133  if (threads_are_active())
134  {
135  return (active.threads.x ? dims.threads.x : 1) *
136  (active.threads.y ? dims.threads.y : 1) *
137  (active.threads.z ? dims.threads.z : 1);
138  }
139  else
140  {
141  return 0;
142  }
143  }
144 
145  RAJA_INLINE
146  void clamp_to_min_blocks()
147  {
148  dims.blocks.x = std::max(min_dims.blocks.x, dims.blocks.x);
149  dims.blocks.y = std::max(min_dims.blocks.y, dims.blocks.y);
150  dims.blocks.z = std::max(min_dims.blocks.z, dims.blocks.z);
151  };
152 
153  RAJA_INLINE
154  void clamp_to_min_threads()
155  {
156  dims.threads.x = std::max(min_dims.threads.x, dims.threads.x);
157  dims.threads.y = std::max(min_dims.threads.y, dims.threads.y);
158  dims.threads.z = std::max(min_dims.threads.z, dims.threads.z);
159  };
160 };
161 
162 RAJA_INLINE
163 LaunchDims combine(LaunchDims const& lhs, LaunchDims const& rhs)
164 {
165  return lhs.max(rhs);
166 }
167 
168 template<camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
169 struct CudaStatementListExecutorHelper
170 {
171 
172  using next_helper_t =
173  CudaStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
174 
175  using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
176 
177  template<typename Data>
178  inline static RAJA_DEVICE void exec(Data& data, bool thread_active)
179  {
180  // Execute stmt
181  cur_stmt_t::exec(data, thread_active);
182 
183  // Execute next stmt
184  next_helper_t::exec(data, thread_active);
185  }
186 
187  template<typename Data>
188  inline static LaunchDims calculateDimensions(Data& data)
189  {
190  LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
191 
192  LaunchDims next_dims = next_helper_t::calculateDimensions(data);
193 
194  return combine(statement_dims, next_dims);
195  }
196 };
197 
198 template<camp::idx_t num_stmts, typename StmtList>
199 struct CudaStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
200 {
201 
202  template<typename Data>
203  inline static RAJA_DEVICE void exec(Data&, bool)
204  {
205  // nop terminator
206  }
207 
208  template<typename Data>
209  inline static LaunchDims calculateDimensions(Data&)
210  {
211  return LaunchDims();
212  }
213 };
214 
215 
216 template<typename Data, typename Policy, typename Types>
217 struct CudaStatementExecutor;
218 
219 template<typename Data, typename StmtList, typename Types>
220 struct CudaStatementListExecutor;
221 
222 template<typename Data, typename... Stmts, typename Types>
223 struct CudaStatementListExecutor<Data, StatementList<Stmts...>, Types>
224 {
225 
226  using enclosed_stmts_t =
227  camp::list<CudaStatementExecutor<Data, Stmts, Types>...>;
228 
229  static constexpr size_t num_stmts = sizeof...(Stmts);
230 
231  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
232  {
233  // Execute statements in order with helper class
234  CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
235  data, thread_active);
236  }
237 
238  static inline LaunchDims calculateDimensions(Data const& data)
239  {
240  // Compute this statements launch dimensions
241  return CudaStatementListExecutorHelper<
242  0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
243  }
244 };
245 
246 template<typename StmtList, typename Data, typename Types>
247 using cuda_statement_list_executor_t =
248  CudaStatementListExecutor<Data, StmtList, Types>;
249 
250 
//! Maps a kernel indexer policy to the LaunchDims it requires for a given
//! iteration length; defined only through the specializations below.
template<typename kernel_indexer>
struct KernelDimensionCalculator;
253 
254 // specialization for direct unchecked sequential policies
255 template<named_dim dim, kernel_sync_requirement sync>
256 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
257  iteration_mapping::DirectUnchecked,
258  sync,
259  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
260 {
261  using IndexMapper =
262  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
263 
264  template<typename IdxT>
265  static LaunchDims get_dimensions(IdxT len)
266  {
267  if (len != static_cast<IdxT>(1))
268  {
269  RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked "
270  "mapped index space");
271  }
272 
273  return LaunchDims {};
274  }
275 };
276 
277 // specialization for direct unchecked thread policies
278 template<named_dim dim, kernel_sync_requirement sync>
279 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
280  iteration_mapping::DirectUnchecked,
281  sync,
282  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
283 {
284  using IndexMapper =
285  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
286 
287  template<typename IdxT>
288  static LaunchDims get_dimensions(IdxT len)
289  {
290  LaunchDims dims;
291 
292  // BEWARE: if calculated block_size is too high then the kernel launch will
293  // fail
294  set_cuda_dim<dim>(dims.active.threads,
295  static_cast<cuda_dim_member_t>(true));
296  set_cuda_dim<dim>(dims.dims.threads, static_cast<cuda_dim_member_t>(len));
297  set_cuda_dim<dim>(dims.min_dims.threads,
298  static_cast<cuda_dim_member_t>(len));
299 
300  return dims;
301  }
302 };
303 
305 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
306 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
307  iteration_mapping::DirectUnchecked,
308  sync,
309  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
310 {
311  static_assert(BLOCK_SIZE > 0,
312  "block size must be > 0, named_usage::unspecified, or "
313  "named_usage::ignored with kernel");
314 
315  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
316 
317  template<typename IdxT>
318  static LaunchDims get_dimensions(IdxT len)
319  {
320  if (len != static_cast<IdxT>(IndexMapper::block_size))
321  {
322  RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked "
323  "mapped index space");
324  }
325 
326  LaunchDims dims;
327 
328  set_cuda_dim<dim>(dims.active.threads,
329  static_cast<cuda_dim_member_t>(true));
330  set_cuda_dim<dim>(dims.dims.threads,
331  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
332  set_cuda_dim<dim>(dims.min_dims.threads,
333  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
334 
335  return dims;
336  }
337 };
338 
339 // specialization for direct unchecked block policies
340 template<named_dim dim, kernel_sync_requirement sync>
341 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
342  iteration_mapping::DirectUnchecked,
343  sync,
344  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
345 {
346  using IndexMapper =
347  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
348 
349  template<typename IdxT>
350  static LaunchDims get_dimensions(IdxT len)
351  {
352  LaunchDims dims;
353 
354  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
355  set_cuda_dim<dim>(dims.dims.blocks, static_cast<cuda_dim_member_t>(len));
356  set_cuda_dim<dim>(dims.min_dims.blocks,
357  static_cast<cuda_dim_member_t>(len));
358 
359  return dims;
360  }
361 };
362 
364 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
365 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
366  iteration_mapping::DirectUnchecked,
367  sync,
368  cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
369 {
370  static_assert(GRID_SIZE > 0,
371  "grid size must be > 0, named_usage::unspecified, or "
372  "named_usage::ignored with kernel");
373 
374  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
375 
376  template<typename IdxT>
377  static LaunchDims get_dimensions(IdxT len)
378  {
379  if (len != static_cast<IdxT>(IndexMapper::grid_size))
380  {
381  RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked "
382  "mapped index space");
383  }
384 
385  LaunchDims dims;
386 
387  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
388  set_cuda_dim<dim>(dims.dims.blocks,
389  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
390  set_cuda_dim<dim>(dims.min_dims.blocks,
391  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
392 
393  return dims;
394  }
395 };
396 
397 // specialization for direct unchecked global policies
398 template<named_dim dim, kernel_sync_requirement sync>
399 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
400  iteration_mapping::DirectUnchecked,
401  sync,
402  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
403 {
404  using IndexMapper = cuda::
405  IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
406 
407  template<typename IdxT>
408  static LaunchDims get_dimensions(IdxT len)
409  {
410  if (len != static_cast<IdxT>(0))
411  {
412  RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
413  }
414 
415  return {};
416  }
417 };
418 
420 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
421 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
422  iteration_mapping::DirectUnchecked,
423  sync,
424  cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
425 {
426  static_assert(GRID_SIZE > 0,
427  "grid size must be > 0, named_usage::unspecified, or "
428  "named_usage::ignored with kernel");
429 
430  using IndexMapper =
431  cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
432 
433  template<typename IdxT>
434  static LaunchDims get_dimensions(IdxT len)
435  {
436  // BEWARE: if calculated block_size is too high then the kernel launch will
437  // fail
438  const IdxT block_size =
439  RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size));
440  if (len != (block_size * static_cast<IdxT>(IndexMapper::grid_size)))
441  {
442  RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked "
443  "mapped index space");
444  }
445 
446  LaunchDims dims;
447 
448  set_cuda_dim<dim>(dims.active.threads,
449  static_cast<cuda_dim_member_t>(true));
450  set_cuda_dim<dim>(dims.dims.threads,
451  static_cast<cuda_dim_member_t>(block_size));
452  set_cuda_dim<dim>(dims.min_dims.threads,
453  static_cast<cuda_dim_member_t>(block_size));
454 
455  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
456  set_cuda_dim<dim>(dims.dims.blocks,
457  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
458  set_cuda_dim<dim>(dims.min_dims.blocks,
459  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
460 
461  return dims;
462  }
463 };
464 
466 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
467 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
468  iteration_mapping::DirectUnchecked,
469  sync,
470  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
471 {
472  static_assert(BLOCK_SIZE > 0,
473  "block size must be > 0, named_usage::unspecified, or "
474  "named_usage::ignored with kernel");
475 
476  using IndexMapper =
477  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
478 
479  template<typename IdxT>
480  static LaunchDims get_dimensions(IdxT len)
481  {
482  const IdxT grid_size = RAJA_DIVIDE_CEILING_INT(
483  len, static_cast<IdxT>(IndexMapper::block_size));
484  if (len != (static_cast<IdxT>(IndexMapper::block_size) * grid_size))
485  {
486  RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked "
487  "mapped index space");
488  }
489 
490  LaunchDims dims;
491 
492  set_cuda_dim<dim>(dims.active.threads,
493  static_cast<cuda_dim_member_t>(true));
494  set_cuda_dim<dim>(dims.dims.threads,
495  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
496  set_cuda_dim<dim>(dims.min_dims.threads,
497  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
498 
499  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
500  set_cuda_dim<dim>(dims.dims.blocks,
501  static_cast<cuda_dim_member_t>(grid_size));
502  set_cuda_dim<dim>(dims.min_dims.blocks,
503  static_cast<cuda_dim_member_t>(grid_size));
504 
505  return dims;
506  }
507 };
508 
510 template<named_dim dim,
511  int BLOCK_SIZE,
512  int GRID_SIZE,
514 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
515  iteration_mapping::DirectUnchecked,
516  sync,
517  cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
518 {
519  static_assert(BLOCK_SIZE > 0,
520  "block size must be > 0, named_usage::unspecified, or "
521  "named_usage::ignored with kernel");
522  static_assert(GRID_SIZE > 0,
523  "grid size must be > 0, named_usage::unspecified, or "
524  "named_usage::ignored with kernel");
525 
526  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
527 
528  template<typename IdxT>
529  static LaunchDims get_dimensions(IdxT len)
530  {
531  if (len != (static_cast<IdxT>(IndexMapper::block_size) *
532  static_cast<IdxT>(IndexMapper::grid_size)))
533  {
534  RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked "
535  "mapped index space");
536  }
537 
538  LaunchDims dims;
539 
540  set_cuda_dim<dim>(dims.active.threads,
541  static_cast<cuda_dim_member_t>(true));
542  set_cuda_dim<dim>(dims.dims.threads,
543  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
544  set_cuda_dim<dim>(dims.min_dims.threads,
545  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
546 
547  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
548  set_cuda_dim<dim>(dims.dims.blocks,
549  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
550  set_cuda_dim<dim>(dims.min_dims.blocks,
551  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
552 
553  return dims;
554  }
555 };
556 
557 // specialization for direct sequential policies
558 template<named_dim dim, kernel_sync_requirement sync>
559 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
560  iteration_mapping::Direct,
561  sync,
562  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
563 {
564  using IndexMapper =
565  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
566 
567  template<typename IdxT>
568  static LaunchDims get_dimensions(IdxT len)
569  {
570  if (len > static_cast<IdxT>(1))
571  {
573  "len exceeds the size of the directly mapped index space");
574  }
575 
576  return LaunchDims {};
577  }
578 };
579 
580 // specialization for direct thread policies
581 template<named_dim dim, kernel_sync_requirement sync>
582 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
583  iteration_mapping::Direct,
584  sync,
585  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
586 {
587  using IndexMapper =
588  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
589 
590  template<typename IdxT>
591  static LaunchDims get_dimensions(IdxT len)
592  {
593  LaunchDims dims;
594 
595  // BEWARE: if calculated block_size is too high then the kernel launch will
596  // fail
597  set_cuda_dim<dim>(dims.active.threads,
598  static_cast<cuda_dim_member_t>(true));
599  set_cuda_dim<dim>(dims.dims.threads, static_cast<cuda_dim_member_t>(len));
600  set_cuda_dim<dim>(dims.min_dims.threads,
601  static_cast<cuda_dim_member_t>(len));
602 
603  return dims;
604  }
605 };
606 
608 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
609 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
610  iteration_mapping::Direct,
611  sync,
612  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
613 {
614  static_assert(BLOCK_SIZE > 0,
615  "block size must be > 0, named_usage::unspecified, or "
616  "named_usage::ignored with kernel");
617 
618  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
619 
620  template<typename IdxT>
621  static LaunchDims get_dimensions(IdxT len)
622  {
623  constexpr auto zero = static_cast<IdxT>(0);
624 
625  if (len > static_cast<IdxT>(IndexMapper::block_size))
626  {
628  "len exceeds the size of the directly mapped index space");
629  }
630 
631  LaunchDims dims;
632 
633  set_cuda_dim<dim>(dims.active.threads,
634  static_cast<cuda_dim_member_t>(true));
635  set_cuda_dim<dim>(dims.dims.threads,
636  static_cast<cuda_dim_member_t>(
637  (len > zero) ? IndexMapper::block_size : 0));
638  set_cuda_dim<dim>(dims.min_dims.threads,
639  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
640 
641  return dims;
642  }
643 };
644 
645 // specialization for direct block policies
646 template<named_dim dim, kernel_sync_requirement sync>
647 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
648  iteration_mapping::Direct,
649  sync,
650  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
651 {
652  using IndexMapper =
653  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
654 
655  template<typename IdxT>
656  static LaunchDims get_dimensions(IdxT len)
657  {
658  LaunchDims dims;
659 
660  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
661  set_cuda_dim<dim>(dims.dims.blocks, static_cast<cuda_dim_member_t>(len));
662  set_cuda_dim<dim>(dims.min_dims.blocks,
663  static_cast<cuda_dim_member_t>(len));
664 
665  return dims;
666  }
667 };
668 
670 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
671 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
672  iteration_mapping::Direct,
673  sync,
674  cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
675 {
676  static_assert(GRID_SIZE > 0,
677  "grid size must be > 0, named_usage::unspecified, or "
678  "named_usage::ignored with kernel");
679 
680  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
681 
682  template<typename IdxT>
683  static LaunchDims get_dimensions(IdxT len)
684  {
685  constexpr auto zero = static_cast<IdxT>(0);
686 
687  if (len > static_cast<IdxT>(IndexMapper::grid_size))
688  {
690  "len exceeds the size of the directly mapped index space");
691  }
692 
693  LaunchDims dims;
694 
695  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
696  set_cuda_dim<dim>(dims.dims.blocks,
697  static_cast<cuda_dim_member_t>(
698  (len > zero) ? IndexMapper::grid_size : 0));
699  set_cuda_dim<dim>(dims.min_dims.blocks,
700  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
701 
702  return dims;
703  }
704 };
705 
706 // specialization for direct global policies
707 template<named_dim dim, kernel_sync_requirement sync>
708 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
709  iteration_mapping::Direct,
710  sync,
711  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
712 {
713  using IndexMapper = cuda::
714  IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
715 
716  template<typename IdxT>
717  static LaunchDims get_dimensions(IdxT len)
718  {
719  if (len > static_cast<IdxT>(0))
720  {
721  RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
722  }
723 
724  return LaunchDims {};
725  }
726 };
727 
729 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
730 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
731  iteration_mapping::Direct,
732  sync,
733  cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
734 {
735  static_assert(GRID_SIZE > 0,
736  "grid size must be > 0, named_usage::unspecified, or "
737  "named_usage::ignored with kernel");
738 
739  using IndexMapper =
740  cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
741 
742  template<typename IdxT>
743  static LaunchDims get_dimensions(IdxT len)
744  {
745  constexpr auto zero = static_cast<IdxT>(0);
746 
747  // BEWARE: if calculated block_size is too high then the kernel launch will
748  // fail
749  const IdxT block_size =
750  RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size));
751 
752  LaunchDims dims;
753 
754  set_cuda_dim<dim>(dims.active.threads,
755  static_cast<cuda_dim_member_t>(true));
756  set_cuda_dim<dim>(dims.dims.threads,
757  static_cast<cuda_dim_member_t>(block_size));
758  set_cuda_dim<dim>(dims.min_dims.threads,
759  static_cast<cuda_dim_member_t>(block_size));
760 
761  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
762  set_cuda_dim<dim>(dims.dims.blocks,
763  static_cast<cuda_dim_member_t>(
764  (len > zero) ? IndexMapper::grid_size : 0));
765  set_cuda_dim<dim>(dims.min_dims.blocks,
766  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
767 
768  return dims;
769  }
770 };
771 
773 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
774 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
775  iteration_mapping::Direct,
776  sync,
777  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
778 {
779  static_assert(BLOCK_SIZE > 0,
780  "block size must be > 0, named_usage::unspecified, or "
781  "named_usage::ignored with kernel");
782 
783  using IndexMapper =
784  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
785 
786  template<typename IdxT>
787  static LaunchDims get_dimensions(IdxT len)
788  {
789  constexpr auto zero = static_cast<IdxT>(0);
790 
791  const IdxT grid_size = RAJA_DIVIDE_CEILING_INT(
792  len, static_cast<IdxT>(IndexMapper::block_size));
793 
794  LaunchDims dims;
795 
796  set_cuda_dim<dim>(dims.active.threads,
797  static_cast<cuda_dim_member_t>(true));
798  set_cuda_dim<dim>(dims.dims.threads,
799  static_cast<cuda_dim_member_t>(
800  (len > zero) ? IndexMapper::block_size : 0));
801  set_cuda_dim<dim>(dims.min_dims.threads,
802  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
803 
804  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
805  set_cuda_dim<dim>(dims.dims.blocks,
806  static_cast<cuda_dim_member_t>(grid_size));
807  set_cuda_dim<dim>(dims.min_dims.blocks,
808  static_cast<cuda_dim_member_t>(grid_size));
809 
810  return dims;
811  }
812 };
813 
815 template<named_dim dim,
816  int BLOCK_SIZE,
817  int GRID_SIZE,
819 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
820  iteration_mapping::Direct,
821  sync,
822  cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
823 {
824  static_assert(BLOCK_SIZE > 0,
825  "block size must be > 0, named_usage::unspecified, or "
826  "named_usage::ignored with kernel");
827  static_assert(GRID_SIZE > 0,
828  "grid size must be > 0, named_usage::unspecified, or "
829  "named_usage::ignored with kernel");
830 
831  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
832 
833  template<typename IdxT>
834  static LaunchDims get_dimensions(IdxT len)
835  {
836  constexpr auto zero = static_cast<IdxT>(0);
837 
838  if (len > (static_cast<IdxT>(IndexMapper::block_size) *
839  static_cast<IdxT>(IndexMapper::grid_size)))
840  {
842  "len exceeds the size of the directly mapped index space");
843  }
844 
845  LaunchDims dims;
846 
847  set_cuda_dim<dim>(dims.active.threads,
848  static_cast<cuda_dim_member_t>(true));
849  set_cuda_dim<dim>(dims.dims.threads,
850  static_cast<cuda_dim_member_t>(
851  (len > zero) ? IndexMapper::block_size : 0));
852  set_cuda_dim<dim>(dims.min_dims.threads,
853  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
854 
855  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
856  set_cuda_dim<dim>(dims.dims.blocks,
857  static_cast<cuda_dim_member_t>(
858  (len > zero) ? IndexMapper::grid_size : 0));
859  set_cuda_dim<dim>(dims.min_dims.blocks,
860  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
861 
862  return dims;
863  }
864 };
865 
866 // specialization for strided loop sequential policies
867 template<named_dim dim, kernel_sync_requirement sync>
868 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
869  iteration_mapping::StridedLoop<named_usage::unspecified>,
870  sync,
871  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
872 {
873  using IndexMapper =
874  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
875 
876  template<typename IdxT>
877  static LaunchDims get_dimensions(IdxT RAJA_UNUSED_ARG(len))
878  {
879  return LaunchDims {};
880  }
881 };
882 
883 // specialization for strided loop thread policies
884 template<named_dim dim, kernel_sync_requirement sync>
885 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
886  iteration_mapping::StridedLoop<named_usage::unspecified>,
887  sync,
888  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
889 {
890  using IndexMapper =
891  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
892 
893  template<typename IdxT>
894  static LaunchDims get_dimensions(IdxT len)
895  {
896  LaunchDims dims;
897 
898  // BEWARE: if calculated block_size is too high then the kernel launch will
899  // fail
900  set_cuda_dim<dim>(dims.active.threads,
901  static_cast<cuda_dim_member_t>(true));
902  set_cuda_dim<dim>(dims.dims.threads, static_cast<cuda_dim_member_t>(len));
903  set_cuda_dim<dim>(dims.min_dims.threads, static_cast<cuda_dim_member_t>(1));
904 
905  return dims;
906  }
907 };
908 
910 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
911 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
912  iteration_mapping::StridedLoop<named_usage::unspecified>,
913  sync,
914  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
915 {
916  static_assert(BLOCK_SIZE > 0,
917  "block size must be > 0, named_usage::unspecified, or "
918  "named_usage::ignored with kernel");
919 
920  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
921 
922  template<typename IdxT>
923  static LaunchDims get_dimensions(IdxT len)
924  {
925  constexpr auto zero = static_cast<IdxT>(0);
926 
927  LaunchDims dims;
928 
929  set_cuda_dim<dim>(dims.active.threads,
930  static_cast<cuda_dim_member_t>(true));
931  set_cuda_dim<dim>(dims.dims.threads,
932  static_cast<cuda_dim_member_t>(
933  (len > zero) ? IndexMapper::block_size : 0));
934  set_cuda_dim<dim>(dims.min_dims.threads,
935  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
936 
937  return dims;
938  }
939 };
940 
941 // specialization for strided loop block policies
942 template<named_dim dim, kernel_sync_requirement sync>
943 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
944  iteration_mapping::StridedLoop<named_usage::unspecified>,
945  sync,
946  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
947 {
948  using IndexMapper =
949  cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
950 
951  template<typename IdxT>
952  static LaunchDims get_dimensions(IdxT len)
953  {
954  LaunchDims dims;
955 
956  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
957  set_cuda_dim<dim>(dims.dims.blocks, static_cast<cuda_dim_member_t>(len));
958  set_cuda_dim<dim>(dims.min_dims.blocks, static_cast<cuda_dim_member_t>(1));
959 
960  return dims;
961  }
962 };
963 
965 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
966 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
967  iteration_mapping::StridedLoop<named_usage::unspecified>,
968  sync,
969  cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
970 {
971  static_assert(GRID_SIZE > 0,
972  "grid size must be > 0, named_usage::unspecified, or "
973  "named_usage::ignored with kernel");
974 
975  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
976 
977  template<typename IdxT>
978  static LaunchDims get_dimensions(IdxT len)
979  {
980  constexpr auto zero = static_cast<IdxT>(0);
981 
982  LaunchDims dims;
983 
984  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
985  set_cuda_dim<dim>(dims.dims.blocks,
986  static_cast<cuda_dim_member_t>(
987  (len > zero) ? IndexMapper::grid_size : 0));
988  set_cuda_dim<dim>(dims.min_dims.blocks,
989  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
990 
991  return dims;
992  }
993 };
994 
995 // specialization for strided loop global policies
996 template<named_dim dim, kernel_sync_requirement sync>
997 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
998  iteration_mapping::StridedLoop<named_usage::unspecified>,
999  sync,
1000  cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
1001 {
1002  using IndexMapper = cuda::
1003  IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
1004 
1005  template<typename IdxT>
1006  static LaunchDims get_dimensions(IdxT len)
1007  {
1008  constexpr auto zero = static_cast<IdxT>(0);
1009 
1010  LaunchDims dims;
1011 
1012  set_cuda_dim<dim>(dims.active.threads,
1013  static_cast<cuda_dim_member_t>(true));
1014  set_cuda_dim<dim>(dims.dims.threads,
1015  static_cast<cuda_dim_member_t>((len > zero) ? 1 : 0));
1016  set_cuda_dim<dim>(dims.min_dims.threads, static_cast<cuda_dim_member_t>(1));
1017 
1018  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
1019  set_cuda_dim<dim>(dims.dims.blocks,
1020  static_cast<cuda_dim_member_t>((len > zero) ? 1 : 0));
1021  set_cuda_dim<dim>(dims.min_dims.blocks, static_cast<cuda_dim_member_t>(1));
1022 
1023  return dims;
1024  }
1025 };
1026 
1028 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
1029 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
1030  iteration_mapping::StridedLoop<named_usage::unspecified>,
1031  sync,
1032  cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
1033 {
1034  static_assert(GRID_SIZE > 0,
1035  "grid size must be > 0, named_usage::unspecified, or "
1036  "named_usage::ignored with kernel");
1037 
1038  using IndexMapper =
1039  cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
1040 
1041  template<typename IdxT>
1042  static LaunchDims get_dimensions(IdxT len)
1043  {
1044  constexpr auto zero = static_cast<IdxT>(0);
1045 
1046  // BEWARE: if calculated block_size is too high then the kernel launch will
1047  // fail
1048  const IdxT block_size =
1049  RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size));
1050 
1051  LaunchDims dims;
1052 
1053  set_cuda_dim<dim>(dims.active.threads,
1054  static_cast<cuda_dim_member_t>(true));
1055  set_cuda_dim<dim>(dims.dims.threads,
1056  static_cast<cuda_dim_member_t>(block_size));
1057  set_cuda_dim<dim>(dims.min_dims.threads, static_cast<cuda_dim_member_t>(1));
1058 
1059  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
1060  set_cuda_dim<dim>(dims.dims.blocks,
1061  static_cast<cuda_dim_member_t>(
1062  (len > zero) ? IndexMapper::grid_size : 0));
1063  set_cuda_dim<dim>(dims.min_dims.blocks,
1064  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
1065 
1066  return dims;
1067  }
1068 };
1069 
1071 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
1072 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
1073  iteration_mapping::StridedLoop<named_usage::unspecified>,
1074  sync,
1075  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
1076 {
1077  static_assert(BLOCK_SIZE > 0,
1078  "block size must be > 0, named_usage::unspecified, or "
1079  "named_usage::ignored with kernel");
1080 
1081  using IndexMapper =
1082  cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
1083 
1084  template<typename IdxT>
1085  static LaunchDims get_dimensions(IdxT len)
1086  {
1087  constexpr auto zero = static_cast<IdxT>(0);
1088 
1089  const IdxT grid_size = RAJA_DIVIDE_CEILING_INT(
1090  len, static_cast<IdxT>(IndexMapper::block_size));
1091 
1092  LaunchDims dims;
1093 
1094  set_cuda_dim<dim>(dims.active.threads,
1095  static_cast<cuda_dim_member_t>(true));
1096  set_cuda_dim<dim>(dims.dims.threads,
1097  static_cast<cuda_dim_member_t>(
1098  (len > zero) ? IndexMapper::block_size : 0));
1099  set_cuda_dim<dim>(dims.min_dims.threads,
1100  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
1101 
1102  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
1103  set_cuda_dim<dim>(dims.dims.blocks,
1104  static_cast<cuda_dim_member_t>(grid_size));
1105  set_cuda_dim<dim>(dims.min_dims.blocks, static_cast<cuda_dim_member_t>(1));
1106 
1107  return dims;
1108  }
1109 };
1110 
1112 template<named_dim dim,
1113  int BLOCK_SIZE,
1114  int GRID_SIZE,
1116 struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
1117  iteration_mapping::StridedLoop<named_usage::unspecified>,
1118  sync,
1119  cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
1120 {
1121  static_assert(BLOCK_SIZE > 0,
1122  "block size must be > 0, named_usage::unspecified, or "
1123  "named_usage::ignored with kernel");
1124  static_assert(GRID_SIZE > 0,
1125  "grid size must be > 0, named_usage::unspecified, or "
1126  "named_usage::ignored with kernel");
1127 
1128  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
1129 
1130  template<typename IdxT>
1131  static LaunchDims get_dimensions(IdxT len)
1132  {
1133  constexpr auto zero = static_cast<IdxT>(0);
1134 
1135  LaunchDims dims;
1136 
1137  set_cuda_dim<dim>(dims.active.threads,
1138  static_cast<cuda_dim_member_t>(true));
1139  set_cuda_dim<dim>(dims.dims.threads,
1140  static_cast<cuda_dim_member_t>(
1141  (len > zero) ? IndexMapper::block_size : 0));
1142  set_cuda_dim<dim>(dims.min_dims.threads,
1143  static_cast<cuda_dim_member_t>(IndexMapper::block_size));
1144 
1145  set_cuda_dim<dim>(dims.active.blocks, static_cast<cuda_dim_member_t>(true));
1146  set_cuda_dim<dim>(dims.dims.blocks,
1147  static_cast<cuda_dim_member_t>(
1148  (len > zero) ? IndexMapper::grid_size : 0));
1149  set_cuda_dim<dim>(dims.min_dims.blocks,
1150  static_cast<cuda_dim_member_t>(IndexMapper::grid_size));
1151 
1152  return dims;
1153  }
1154 };
1155 
1156 } // namespace internal
1157 
1158 } // namespace RAJA
1159 
1160 #endif // closing endif for RAJA_ENABLE_CUDA guard
1161 
1162 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for CUDA reductions and other operations.
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
#define RAJA_DEVICE
Definition: macros.hpp:66
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
named_dim
Definition: types.hpp:53
kernel_sync_requirement
Definition: types.hpp:63
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
RAJA header file containing user interface for RAJA::kernel.
Header file for RAJA type definitions.