RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
CudaKernel.hpp
Go to the documentation of this file.
1 
12 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
13 // Copyright (c) Lawrence Livermore National Security, LLC and other
14 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
15 // files for dates and other details. No copyright assignment is required
16 // to contribute to RAJA.
17 //
18 // SPDX-License-Identifier: (BSD-3-Clause)
19 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
20 
21 #ifndef RAJA_policy_cuda_kernel_CudaKernel_HPP
22 #define RAJA_policy_cuda_kernel_CudaKernel_HPP
23 
24 #include "RAJA/config.hpp"
25 
26 #if defined(RAJA_ENABLE_CUDA)
27 
28 #include <cassert>
29 #include <climits>
30 
31 #include "camp/camp.hpp"
32 
33 #include "RAJA/util/macros.hpp"
34 #include "RAJA/util/types.hpp"
35 
36 #include "RAJA/pattern/kernel.hpp"
39 
41 
44 
46 
47 namespace RAJA
48 {
49 
57 template<bool async0, int num_blocks, int num_threads, int blocks_per_sm>
58 struct cuda_explicit_launch
59 {};
60 
74 template<bool async0, int num_blocks, int num_threads>
75 using cuda_launch = cuda_explicit_launch<async0,
76  num_blocks,
77  num_threads,
78  policy::cuda::MIN_BLOCKS_PER_SM>;
79 
85 template<int num_threads0, bool async0>
86 using cuda_occ_calc_launch =
87  cuda_explicit_launch<async0,
88  0,
89  num_threads0,
90  policy::cuda::MIN_BLOCKS_PER_SM>;
91 
92 namespace statement
93 {
94 
100 template<typename LaunchConfig, typename... EnclosedStmts>
101 struct CudaKernelExt
102  : public internal::Statement<
103  ::RAJA::policy::cuda::
104  cuda_exec_explicit<LaunchConfig, void, void, 0, true>,
105  EnclosedStmts...>
106 {};
107 
114 template<int num_blocks, int num_threads, typename... EnclosedStmts>
115 using CudaKernelExp = CudaKernelExt<cuda_launch<false, num_blocks, num_threads>,
116  EnclosedStmts...>;
117 
124 template<int num_blocks, int num_threads, typename... EnclosedStmts>
125 using CudaKernelExpAsync =
126  CudaKernelExt<cuda_launch<true, num_blocks, num_threads>, EnclosedStmts...>;
127 
133 template<typename... EnclosedStmts>
134 using CudaKernelOcc =
135  CudaKernelExt<cuda_occ_calc_launch<1024, false>, EnclosedStmts...>;
136 
142 template<typename... EnclosedStmts>
143 using CudaKernelOccAsync =
144  CudaKernelExt<cuda_occ_calc_launch<1024, true>, EnclosedStmts...>;
145 
151 template<int num_threads, typename... EnclosedStmts>
152 using CudaKernelFixed = CudaKernelExt<
154  EnclosedStmts...>;
155 
161 template<int num_threads, typename... EnclosedStmts>
162 using CudaKernelFixedAsync =
164  EnclosedStmts...>;
165 
171 template<int num_threads, int blocks_per_sm, typename... EnclosedStmts>
172 using CudaKernelFixedSM =
173  CudaKernelExt<cuda_explicit_launch<false,
175  num_threads,
176  blocks_per_sm>,
177  EnclosedStmts...>;
178 
184 template<int num_threads, int blocks_per_sm, typename... EnclosedStmts>
185 using CudaKernelFixedSMAsync =
186  CudaKernelExt<cuda_explicit_launch<true,
188  num_threads,
189  blocks_per_sm>,
190  EnclosedStmts...>;
191 
196 template<typename... EnclosedStmts>
197 using CudaKernel = CudaKernelFixed<1024, EnclosedStmts...>;
198 
203 template<typename... EnclosedStmts>
204 using CudaKernelAsync = CudaKernelFixedAsync<1024, EnclosedStmts...>;
205 
206 } // namespace statement
207 
208 namespace internal
209 {
210 
211 
215 template<typename Data, typename Exec>
216 __global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
217 {
218 
219  using data_t = camp::decay<Data>;
220  data_t private_data = data;
221 
222  Exec::exec(private_data, true);
223 
224  RAJA::expt::detail::combine_params<RAJA::cuda_flatten_global_xyz_direct>(
225  private_data.param_tuple);
226 }
227 
235 template<int BlockSize, int BlocksPerSM, typename Data, typename Exec>
236 __launch_bounds__(BlockSize, BlocksPerSM) __global__
237  void CudaKernelLauncherFixed(const RAJA_CUDA_GRID_CONSTANT Data data)
238 {
239 
240  using data_t = camp::decay<Data>;
241  data_t private_data = data;
242 
243  // execute the the object
244  Exec::exec(private_data, true);
245 
246  RAJA::expt::detail::combine_params<RAJA::cuda_flatten_global_xyz_direct>(
247  private_data.param_tuple);
248 }
249 
258 template<int BlockSize, int BlocksPerSM, typename Data, typename executor_t>
259 struct CudaKernelLauncherGetter
260 {
261  using type =
262  camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize,
263  BlocksPerSM,
264  Data,
265  executor_t>)>;
266 
267  static constexpr type get() noexcept
268  {
269  return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data,
270  executor_t>;
271  }
272 };
273 
278 template<typename Data, typename executor_t>
279 struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
280 {
281  using type =
282  camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
283 
284  static constexpr type get() noexcept
285  {
286  return &internal::CudaKernelLauncher<Data, executor_t>;
287  }
288 };
289 
290 
295 template<typename LaunchPolicy,
296  typename StmtList,
297  typename Data,
298  typename Types>
299 struct CudaLaunchHelper;
300 
306 template<bool async0,
307  int num_blocks,
308  int num_threads,
309  int blocks_per_sm,
310  typename StmtList,
311  typename Data,
312  typename Types>
313 struct CudaLaunchHelper<
314  cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,
315  StmtList,
316  Data,
317  Types>
318 {
319  using Self = CudaLaunchHelper;
320 
321  static constexpr bool async = async0;
322 
323  using executor_t =
324  internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
325 
326  using kernelGetter_t =
327  CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
328  (blocks_per_sm <= 0) ? 0 : blocks_per_sm,
329  Data,
330  executor_t>;
331 
332  inline static const void* get_func()
333  {
334  return reinterpret_cast<const void*>(kernelGetter_t::get());
335  }
336 
337  inline static void recommended_blocks_threads(size_t shmem_size,
338  int& recommended_blocks,
339  int& recommended_threads)
340  {
341  auto func = Self::get_func();
342 
343  if (num_blocks <= 0)
344  {
345 
346  if (num_threads <= 0)
347  {
348 
349  //
350  // determine blocks at runtime
351  // determine threads at runtime
352  //
353  auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads<Self>(
354  func, shmem_size);
355  recommended_blocks = data.func_max_blocks_per_device;
356  recommended_threads = data.func_max_threads_per_block;
357  }
358  else
359  {
360 
361  //
362  // determine blocks at runtime
363  // threads determined at compile-time
364  //
365  recommended_threads = num_threads;
366 
367  auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
368  func, shmem_size);
369  recommended_blocks =
370  data.func_max_blocks_per_sm * data.device_sm_per_device;
371  }
372  }
373  else
374  {
375 
376  if (num_threads <= 0)
377  {
378 
379  //
380  // determine threads at runtime, unsure what use 1024
381  // this value may be invalid for kernels with high register pressure
382  //
383  recommended_threads = 1024;
384  }
385  else
386  {
387 
388  //
389  // threads determined at compile-time
390  //
391  recommended_threads = num_threads;
392  }
393 
394  //
395  // blocks determined at compile-time
396  //
397  recommended_blocks = num_blocks;
398  }
399  }
400 
401  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
402  int& max_threads)
403  {
404  if (num_threads <= 0)
405  {
406 
407  //
408  // determine threads at runtime, unsure what use 1024
409  // this value may be invalid for kernels with high register pressure
410  //
411  max_threads = 1024;
412  }
413  else
414  {
415 
416  //
417  // threads determined at compile-time
418  //
419  max_threads = num_threads;
420  }
421  }
422 
423  inline static void max_blocks(size_t shmem_size,
424  int& max_blocks,
425  int actual_threads)
426  {
427  auto func = Self::get_func();
428 
429  if (num_blocks <= 0)
430  {
431 
432  //
433  // determine blocks at runtime
434  //
435  if (num_threads <= 0 || num_threads != actual_threads)
436  {
437 
438  //
439  // determine blocks when actual_threads != num_threads
440  //
441  auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self>(
442  func, shmem_size, actual_threads);
443  max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
444  }
445  else
446  {
447 
448  //
449  // determine blocks when actual_threads == num_threads
450  //
451  auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
452  func, shmem_size);
453  max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
454  }
455  }
456  else
457  {
458 
459  //
460  // blocks determined at compile-time
461  //
462  max_blocks = num_blocks;
463  }
464  }
465 };
466 
476 inline cuda_dim_t fitCudaDims(cuda_dim_member_t limit,
477  cuda_dim_t result,
478  cuda_dim_t minimum = cuda_dim_t())
479 {
480 
481 
482  // clamp things to at least 1
483  result.x = result.x ? result.x : 1;
484  result.y = result.y ? result.y : 1;
485  result.z = result.z ? result.z : 1;
486 
487  minimum.x = minimum.x ? minimum.x : 1;
488  minimum.y = minimum.y ? minimum.y : 1;
489  minimum.z = minimum.z ? minimum.z : 1;
490 
491  // if we are under the limit, we're done
492  if (result.x * result.y * result.z <= limit) return result;
493 
494  // Can we reduce z to fit?
495  if (result.x * result.y * minimum.z < limit)
496  {
497  // compute a new z
498  result.z = limit / (result.x * result.y);
499  return result;
500  }
501  // we don't fit, so reduce z to it's minimum and continue on to y
502  result.z = minimum.z;
503 
504 
505  // Can we reduce y to fit?
506  if (result.x * minimum.y * result.z < limit)
507  {
508  // compute a new y
509  result.y = limit / (result.x * result.z);
510  return result;
511  }
512  // we don't fit, so reduce y to it's minimum and continue on to x
513  result.y = minimum.y;
514 
515 
516  // Can we reduce y to fit?
517  if (minimum.x * result.y * result.z < limit)
518  {
519  // compute a new x
520  result.x = limit / (result.y * result.z);
521  return result;
522  }
523  // we don't fit, so we'll return the smallest possible thing
524  result.x = minimum.x;
525 
526  return result;
527 }
528 
532 template<typename LaunchConfig, typename... EnclosedStmts, typename Types>
533 struct StatementExecutor<
534  statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>,
535  Types>
536 {
537 
538  using stmt_list_t = StatementList<EnclosedStmts...>;
539  using StatementType =
540  statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>;
541 
542  template<typename Data>
543  static inline void exec(Data&& data)
544  {
545 
546  using data_t = camp::decay<Data>;
547  using executor_t =
548  cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
549  using launch_t = CudaLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
550 
551 
552  RAJA::resources::Cuda res = data.get_resource();
553 
554 
555  //
556  // Compute the requested kernel dimensions
557  //
558  LaunchDims launch_dims = executor_t::calculateDimensions(data);
559 
560 
561  // Only launch kernel if we have something to iterate over
562  bool active_threads = launch_dims.threads_are_active();
563  bool active_blocks = launch_dims.blocks_are_active();
564  int num_blocks = launch_dims.num_blocks();
565  int num_threads = launch_dims.num_threads();
566  if ((active_threads || active_blocks) &&
567  (!active_blocks || num_blocks > 0) &&
568  (!active_threads || num_threads > 0))
569  {
570 
571  //
572  // Setup shared memory buffers
573  //
574  size_t shmem = 0;
575 
576 
577  //
578  // Compute the recommended physical kernel blocks and threads
579  //
580  int recommended_blocks;
581  int recommended_threads;
582  launch_t::recommended_blocks_threads(shmem, recommended_blocks,
583  recommended_threads);
584 
585 
586  //
587  // Compute the MAX physical kernel threads
588  //
589  int max_threads;
590  launch_t::max_threads(shmem, max_threads);
591 
592 
593  //
594  // Fit the requested threads
595  //
596  cuda_dim_t fit_threads {0, 0, 0};
597 
598  if (recommended_threads >= get_size(launch_dims.min_dims.threads))
599  {
600 
601  fit_threads = fitCudaDims(recommended_threads, launch_dims.dims.threads,
602  launch_dims.min_dims.threads);
603  }
604 
605  //
606  // Redo fit with max threads
607  //
608  if (recommended_threads < max_threads &&
609  get_size(fit_threads) != recommended_threads)
610  {
611 
612  fit_threads = fitCudaDims(max_threads, launch_dims.dims.threads,
613  launch_dims.min_dims.threads);
614  }
615 
616  launch_dims.dims.threads = fit_threads;
617 
618 
619  //
620  // Compute the MAX physical kernel blocks
621  //
622  int max_blocks;
623  launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads());
624 
625  int use_blocks;
626 
627  if (launch_dims.num_threads() == recommended_threads)
628  {
629 
630  //
631  // Fit the requested blocks
632  //
633  use_blocks = recommended_blocks;
634  }
635  else
636  {
637 
638  //
639  // Fit the max blocks
640  //
641  use_blocks = max_blocks;
642  }
643 
644  launch_dims.dims.blocks = fitCudaDims(use_blocks, launch_dims.dims.blocks,
645  launch_dims.min_dims.blocks);
646 
647  //
648  // make sure that we fit
649  //
650  /* Doesn't make sense to check this anymore - AJK
651  if(launch_dims.num_blocks() > max_blocks){
652  RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
653  }*/
654  if (launch_dims.num_threads() > max_threads)
655  {
656  RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
657  }
658 
659  {
660  auto func = launch_t::get_func();
661  // The exact policy here does not affect the reduction operation, but
662  // we do need to accurately pass a resource and launch dimensions to
663  // perform initialization and resolution of reduction parameters.
664  using EXEC_POL =
665  ::RAJA::policy::cuda::cuda_exec_explicit<LaunchConfig, void, void,
666  0, true>;
667 
668  RAJA::cuda::detail::cudaInfo launch_info;
669  launch_info.gridDim = launch_dims.dims.blocks;
670  launch_info.blockDim = launch_dims.dims.threads;
671  launch_info.dynamic_smem = &shmem;
672  launch_info.res = res;
673 
674  RAJA::expt::detail::init_params<EXEC_POL>(data.param_tuple,
675  launch_info);
676  //
677  // Privatize the LoopData, using make_launch_body to setup reductions
678  //
679  // Note that there is a circular dependency between the previous setup
680  // of the launch_dims and potential changes to shmem here that is
681  // currently an unresolved issue.
682  //
683  auto cuda_data = RAJA::cuda::make_launch_body(
684  func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
685  data);
686 
687  //
688  // Launch the kernel
689  //
690  void* args[] = {(void*)&cuda_data};
691  RAJA::cuda::launch(func, launch_dims.dims.blocks,
692  launch_dims.dims.threads, args, shmem, res,
693  launch_t::async);
694  RAJA::expt::detail::resolve_params<EXEC_POL>(data.param_tuple,
695  launch_info);
696  }
697  }
698  }
699 };
700 
701 
702 } // namespace internal
703 } // namespace RAJA
704 
705 #endif // closing endif for RAJA_ENABLE_CUDA guard
706 
707 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97
Args args
Definition: WorkRunner.hpp:212
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41
Definition: AlignedRangeIndexSetBuilders.cpp:35
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file for statement wrappers and executors.
Header file for kernel lambda executor.
RAJA header file containing user interface for RAJA::kernel.
RAJA header file containing constructs used to run kernel traversals on GPU with CUDA.
Header file for RAJA type definitions.