doxygen/html/CudaKernel_8hpp_source.html

 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

 // Copyright (c) Lawrence Livermore National Security, LLC and other

 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT

 // files for dates and other details. No copyright assignment is required

 // to contribute to RAJA.

 //

 // SPDX-License-Identifier: (BSD-3-Clause)

 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//


 #ifndef RAJA_policy_cuda_kernel_CudaKernel_HPP

 #define RAJA_policy_cuda_kernel_CudaKernel_HPP


 #include "RAJA/config.hpp"


 #if defined(RAJA_ENABLE_CUDA)


 #include <cassert>

 #include <climits>


 #include "camp/camp.hpp"


 #include "RAJA/util/macros.hpp"

 #include "RAJA/util/types.hpp"


 #include "RAJA/pattern/kernel.hpp"

 #include "RAJA/pattern/kernel/For.hpp"

 #include "RAJA/pattern/kernel/Lambda.hpp"


 #include "RAJA/pattern/params/forall.hpp"


 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"

 #include "RAJA/policy/cuda/policy.hpp"


 #include "RAJA/policy/cuda/kernel/internal.hpp"


 namespace RAJA

 {


 //
 // Empty tag type describing how a CUDA kernel statement is launched. All
 // information is carried by the template arguments, which are consumed by
 // internal::CudaLaunchHelper:
 //
 //   async0        - exposed as CudaLaunchHelper::async and passed to
 //                   RAJA::cuda::launch as its last argument.
 //   num_blocks    - compile-time block count; a value <= 0 makes
 //                   CudaLaunchHelper determine the block count at runtime.
 //   num_threads   - compile-time threads per block; a value <= 0 makes
 //                   CudaLaunchHelper determine the thread count at runtime.
 //   blocks_per_sm - when > 0, becomes the second __launch_bounds__ argument
 //                   of the kernel selected through CudaKernelLauncherGetter.
 //
 template<bool async0, int num_blocks, int num_threads, int blocks_per_sm>

 struct cuda_explicit_launch

 {};


 //
 // Shorthand for cuda_explicit_launch with blocks_per_sm fixed to
 // policy::cuda::MIN_BLOCKS_PER_SM.
 //
 template<bool async0, int num_blocks, int num_threads>

 using cuda_launch = cuda_explicit_launch<async0,

                                          num_blocks,

                                          num_threads,

                                          policy::cuda::MIN_BLOCKS_PER_SM>;


 //
 // cuda_explicit_launch with num_blocks = 0 and blocks_per_sm fixed to
 // policy::cuda::MIN_BLOCKS_PER_SM. Because num_blocks is not positive,
 // internal::CudaLaunchHelper obtains the block count at runtime from the
 // occupancy helpers.
 //
 // Note the parameter order (threads first, then async), which differs from
 // cuda_launch.
 //
 template<int num_threads0, bool async0>

 using cuda_occ_calc_launch =

     cuda_explicit_launch<async0,

                          0,

                          num_threads0,

                          policy::cuda::MIN_BLOCKS_PER_SM>;


 //
 // RAJA::kernel statement types that launch their enclosed statements as a
 // CUDA kernel. Everything here is an alias of CudaKernelExt with a particular
 // cuda_explicit_launch configuration.
 //
 namespace statement

 {


 //
 // Statement that executes EnclosedStmts inside a CUDA kernel launched
 // according to LaunchConfig. In this file CudaLaunchHelper is specialized
 // only for LaunchConfig = cuda_explicit_launch<...>.
 //
 template<typename LaunchConfig, typename... EnclosedStmts>

 struct CudaKernelExt

     : public internal::Statement<

           ::RAJA::policy::cuda::

               cuda_exec_explicit<LaunchConfig, void, void, 0, true>,

           EnclosedStmts...>

 {};


 //
 // Explicit block count and threads per block; async flag is false.
 //
 template<int num_blocks, int num_threads, typename... EnclosedStmts>

 using CudaKernelExp = CudaKernelExt<cuda_launch<false, num_blocks, num_threads>,

                                     EnclosedStmts...>;


 //
 // Same as CudaKernelExp with the async flag set to true.
 //
 template<int num_blocks, int num_threads, typename... EnclosedStmts>

 using CudaKernelExpAsync =

     CudaKernelExt<cuda_launch<true, num_blocks, num_threads>, EnclosedStmts...>;


 //
 // 1024 threads per block with num_blocks = 0, so the block count is
 // determined at runtime by CudaLaunchHelper; async flag is false.
 //
 template<typename... EnclosedStmts>

 using CudaKernelOcc =

     CudaKernelExt<cuda_occ_calc_launch<1024, false>, EnclosedStmts...>;


 //
 // Same as CudaKernelOcc with the async flag set to true.
 //
 template<typename... EnclosedStmts>

 using CudaKernelOccAsync =

     CudaKernelExt<cuda_occ_calc_launch<1024, true>, EnclosedStmts...>;


 //
 // Fixed threads per block; num_blocks is operators::limits<int>::max(), so
 // CudaLaunchHelper reports that value as both the recommended and the
 // maximum block count. Async flag is false.
 //
 template<int num_threads, typename... EnclosedStmts>

 using CudaKernelFixed = CudaKernelExt<

     cuda_launch<false, operators::limits<int>::max(), num_threads>,

     EnclosedStmts...>;


 //
 // Same as CudaKernelFixed with the async flag set to true.
 //
 template<int num_threads, typename... EnclosedStmts>

 using CudaKernelFixedAsync =

     CudaKernelExt<cuda_launch<true, operators::limits<int>::max(), num_threads>,

                   EnclosedStmts...>;


 //
 // Like CudaKernelFixed, but blocks_per_sm is chosen by the user instead of
 // defaulting to policy::cuda::MIN_BLOCKS_PER_SM. Async flag is false.
 //
 template<int num_threads, int blocks_per_sm, typename... EnclosedStmts>

 using CudaKernelFixedSM =

     CudaKernelExt<cuda_explicit_launch<false,

                                        operators::limits<int>::max(),

                                        num_threads,

                                        blocks_per_sm>,

                   EnclosedStmts...>;


 //
 // Same as CudaKernelFixedSM with the async flag set to true.
 //
 template<int num_threads, int blocks_per_sm, typename... EnclosedStmts>

 using CudaKernelFixedSMAsync =

     CudaKernelExt<cuda_explicit_launch<true,

                                        operators::limits<int>::max(),

                                        num_threads,

                                        blocks_per_sm>,

                   EnclosedStmts...>;


 //
 // Default kernel statement: CudaKernelFixed with 1024 threads per block.
 //
 template<typename... EnclosedStmts>

 using CudaKernel = CudaKernelFixed<1024, EnclosedStmts...>;


 //
 // Default async kernel statement: CudaKernelFixedAsync with 1024 threads per
 // block.
 //
 template<typename... EnclosedStmts>

 using CudaKernelAsync = CudaKernelFixedAsync<1024, EnclosedStmts...>;


 }  // namespace statement


 namespace internal

 {


 //
 // CUDA kernel entry point without launch bounds.
 //
 // Copies the (const) kernel argument into a mutable local object, runs the
 // statement executor Exec on that copy, and finally combines the parameters
 // held in the copy's param_tuple.
 //
 template<typename Data, typename Exec>
 __global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
 {
   // The argument is const, so every thread works on its own local copy.
   camp::decay<Data> thread_local_data = data;

   // Run the enclosed statements; the second argument is always true here.
   Exec::exec(thread_local_data, true);

   // Combine the parameter values accumulated in the local copy.
   RAJA::expt::detail::combine_params<RAJA::cuda_flatten_global_xyz_direct>(
       thread_local_data.param_tuple);
 }


 //
 // CUDA kernel entry point compiled with
 // __launch_bounds__(BlockSize, BlocksPerSM).
 //
 // Apart from the launch bounds it does the same work as CudaKernelLauncher:
 // copy the (const) kernel argument into a mutable local object, run the
 // statement executor Exec on the copy, then combine the parameters held in
 // the copy's param_tuple.
 //
 template<int BlockSize, int BlocksPerSM, typename Data, typename Exec>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
     void CudaKernelLauncherFixed(const RAJA_CUDA_GRID_CONSTANT Data data)
 {
   // The argument is const, so every thread works on its own local copy.
   camp::decay<Data> thread_local_data = data;

   // Run the enclosed statements; the second argument is always true here.
   Exec::exec(thread_local_data, true);

   // Combine the parameter values accumulated in the local copy.
   RAJA::expt::detail::combine_params<RAJA::cuda_flatten_global_xyz_direct>(
       thread_local_data.param_tuple);
 }


 //
 // Selects the kernel entry point for a (BlockSize, BlocksPerSM) pair.
 //
 // This primary template yields the launch-bounded kernel
 // CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>.
 // The <0, 0> specialization yields the unbounded CudaKernelLauncher instead.
 //
 // NOTE(review): only <0, 0> is specialized, so BlockSize == 0 combined with
 // BlocksPerSM > 0 also lands here and instantiates
 // __launch_bounds__(0, BlocksPerSM) -- confirm that combination is intended
 // or cannot occur.
 //
 template<int BlockSize, int BlocksPerSM, typename Data, typename executor_t>

 struct CudaKernelLauncherGetter

 {

   // Function-pointer type of the selected kernel.
   using type =

       camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize,

                                                               BlocksPerSM,

                                                               Data,

                                                               executor_t>)>;


   // Returns the address of the selected kernel.
   static constexpr type get() noexcept

   {

     return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data,

                                               executor_t>;

   }

 };


 //
 // Specialization for BlockSize == 0 and BlocksPerSM == 0: selects the kernel
 // without launch bounds, CudaKernelLauncher<Data, executor_t>.
 //
 template<typename Data, typename executor_t>

 struct CudaKernelLauncherGetter<0, 0, Data, executor_t>

 {

   // Function-pointer type of the unbounded kernel.
   using type =

       camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;


   // Returns the address of the unbounded kernel.
   static constexpr type get() noexcept

   {

     return &internal::CudaKernelLauncher<Data, executor_t>;

   }

 };


 //
 // Helper that maps a launch policy onto a kernel function pointer and onto
 // recommended / maximum block and thread counts. Only declared here; the
 // specialization for cuda_explicit_launch follows.
 //
 template<typename LaunchPolicy,

          typename StmtList,

          typename Data,

          typename Types>

 struct CudaLaunchHelper;


 //
 // CudaLaunchHelper for cuda_explicit_launch policies.
 //
 // Provides the kernel function pointer for the statement list and reports
 // block / thread counts. A positive num_blocks or num_threads template
 // argument is used as given; a non-positive one is resolved at runtime
 // (through the ::RAJA::cuda occupancy helpers, or a fixed fallback of 1024
 // threads where noted below).
 //
 template<bool async0,
          int num_blocks,
          int num_threads,
          int blocks_per_sm,
          typename StmtList,
          typename Data,
          typename Types>
 struct CudaLaunchHelper<
     cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,
     StmtList,
     Data,
     Types>
 {
   using Self = CudaLaunchHelper;

   // Async flag taken from the launch policy.
   static constexpr bool async = async0;

   // Device-side executor for the enclosed statement list.
   using executor_t =
       internal::cuda_statement_list_executor_t<StmtList, Data, Types>;

   // Kernel selector; non-positive thread / blocks-per-SM values are
   // normalized to 0 before being handed to CudaKernelLauncherGetter.
   using kernelGetter_t =
       CudaKernelLauncherGetter<(num_threads > 0) ? num_threads : 0,
                                (blocks_per_sm > 0) ? blocks_per_sm : 0,
                                Data,
                                executor_t>;

   // Returns the selected kernel as an untyped function pointer.
   inline static const void* get_func()
   {
     return reinterpret_cast<const void*>(kernelGetter_t::get());
   }

   //
   // Writes the recommended block and thread counts for the kernel.
   //
   //   shmem_size          - dynamic shared memory size handed to the
   //                         occupancy helpers.
   //   recommended_blocks  - [out] recommended number of blocks.
   //   recommended_threads - [out] recommended threads per block.
   //
   inline static void recommended_blocks_threads(size_t shmem_size,
                                                 int& recommended_blocks,
                                                 int& recommended_threads)
   {
     const void* kernel_ptr = Self::get_func();

     if (num_blocks > 0)
     {
       // Blocks fixed at compile time. Threads are either fixed as well, or
       // fall back to 1024 -- a value that may be invalid for kernels with
       // high register pressure.
       recommended_threads = (num_threads > 0) ? num_threads : 1024;
       recommended_blocks  = num_blocks;
     }
     else if (num_threads > 0)
     {
       // Threads fixed at compile time, blocks determined at runtime.
       recommended_threads = num_threads;

       auto occupancy =
           ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
               kernel_ptr, shmem_size);
       recommended_blocks =
           occupancy.func_max_blocks_per_sm * occupancy.device_sm_per_device;
     }
     else
     {
       // Both blocks and threads determined at runtime.
       auto occupancy = ::RAJA::cuda::cuda_occupancy_max_blocks_threads<Self>(
           kernel_ptr, shmem_size);
       recommended_blocks  = occupancy.func_max_blocks_per_device;
       recommended_threads = occupancy.func_max_threads_per_block;
     }
   }

   //
   // Writes the maximum threads per block: num_threads when it is positive,
   // otherwise 1024 (which may be invalid for kernels with high register
   // pressure). The shared memory size is not used.
   //
   inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
                                  int& max_threads)
   {
     max_threads = (num_threads > 0) ? num_threads : 1024;
   }

   //
   // Writes the maximum number of blocks for a launch that uses
   // actual_threads threads per block.
   //
   //   shmem_size     - dynamic shared memory size handed to the occupancy
   //                    helpers.
   //   max_blocks     - [out] maximum number of blocks.
   //   actual_threads - threads per block that will actually be launched.
   //
   inline static void max_blocks(size_t shmem_size,
                                 int& max_blocks,
                                 int actual_threads)
   {
     const void* kernel_ptr = Self::get_func();

     if (num_blocks > 0)
     {
       // Blocks fixed at compile time.
       max_blocks = num_blocks;
     }
     else if (num_threads > 0 && num_threads == actual_threads)
     {
       // Launch uses exactly the compile-time thread count.
       auto occupancy =
           ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
               kernel_ptr, shmem_size);
       max_blocks =
           occupancy.func_max_blocks_per_sm * occupancy.device_sm_per_device;
     }
     else
     {
       // Thread count only known at runtime, or different from num_threads.
       auto occupancy = ::RAJA::cuda::cuda_occupancy_max_blocks<Self>(
           kernel_ptr, shmem_size, actual_threads);
       max_blocks =
           occupancy.func_max_blocks_per_sm * occupancy.device_sm_per_device;
     }
   }
 };


 //
 // Shrinks a requested 3D extent so that x * y * z does not exceed `limit`,
 // without going below `minimum` in any dimension.
 //
 //   limit   - largest allowed product of the three extents.
 //   result  - requested extents; components equal to 0 are treated as 1.
 //   minimum - smallest allowed extents; components equal to 0 are treated
 //             as 1. Defaults to a value-initialized cuda_dim_t.
 //
 // Returns the fitted extents. z is reduced first, then y, then x. If even
 // the minimum extents do not fit under the limit, the minimum extents are
 // returned, so the product of the result may still exceed `limit`.
 //
 // All comparisons against `limit` are done with divisions rather than by
 // multiplying the three extents together. The products were previously
 // evaluated in the member type, where large requested extents could wrap
 // around, wrongly pass the "fits" test, or produce a zero divisor. Results
 // are unchanged whenever the products do not overflow.
 // NOTE(review): assumes cuda_dim_member_t (declared elsewhere) is an
 // integer type; confirm.
 //
 inline cuda_dim_t fitCudaDims(cuda_dim_member_t limit,
                               cuda_dim_t result,
                               cuda_dim_t minimum = cuda_dim_t())
 {
   // Overflow-free test of (a * b * c <= cap) for a, b, c >= 1.
   auto product_at_most = [](cuda_dim_member_t a,
                             cuda_dim_member_t b,
                             cuda_dim_member_t c,
                             cuda_dim_member_t cap) -> bool {
     if (a > cap) return false;
     if (b > cap / a) return false;  // from here on a * b <= cap
     return c <= cap / (a * b);
   };

   // Overflow-free test of (a * b * c < cap) for a, b, c >= 1.
   auto product_below = [&product_at_most](cuda_dim_member_t a,
                                           cuda_dim_member_t b,
                                           cuda_dim_member_t c,
                                           cuda_dim_member_t cap) -> bool {
     return cap > 0 && product_at_most(a, b, c, cap - 1);
   };

   // clamp things to at least 1
   result.x = result.x ? result.x : 1;
   result.y = result.y ? result.y : 1;
   result.z = result.z ? result.z : 1;

   minimum.x = minimum.x ? minimum.x : 1;
   minimum.y = minimum.y ? minimum.y : 1;
   minimum.z = minimum.z ? minimum.z : 1;

   // if we are under the limit, we're done
   if (product_at_most(result.x, result.y, result.z, limit)) return result;

   // Can we reduce z to fit?
   if (product_below(result.x, result.y, minimum.z, limit))
   {
     // compute a new z (x * y < limit here, so the divisor cannot overflow)
     result.z = limit / (result.x * result.y);
     return result;
   }
   // we don't fit, so reduce z to its minimum and continue on to y
   result.z = minimum.z;

   // Can we reduce y to fit?
   if (product_below(result.x, minimum.y, result.z, limit))
   {
     // compute a new y (x * z < limit here, so the divisor cannot overflow)
     result.y = limit / (result.x * result.z);
     return result;
   }
   // we don't fit, so reduce y to its minimum and continue on to x
   result.y = minimum.y;

   // Can we reduce x to fit?
   if (product_below(minimum.x, result.y, result.z, limit))
   {
     // compute a new x (y * z < limit here, so the divisor cannot overflow)
     result.x = limit / (result.y * result.z);
     return result;
   }
   // we don't fit, so we'll return the smallest possible thing
   result.x = minimum.x;

   return result;
 }


 //
 // Host-side executor for statement::CudaKernelExt.
 //
 // exec() computes the launch dimensions requested by the enclosed
 // statements, fits them to the thread / block counts reported by
 // CudaLaunchHelper, and launches the kernel returned by
 // CudaLaunchHelper::get_func() on the resource obtained from the loop data.
 //
 template<typename LaunchConfig, typename... EnclosedStmts, typename Types>

 struct StatementExecutor<

     statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>,

     Types>

 {


   using stmt_list_t = StatementList<EnclosedStmts...>;

   using StatementType =

       statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>;


   //
   // Launches the enclosed statements as one CUDA kernel.
   //
   //   data - loop data; this function uses data.get_resource() and
   //          data.param_tuple, and passes data to the device executor's
   //          calculateDimensions and to make_launch_body.
   //
   // Nothing is launched when the computed dimensions are inactive or empty.
   // Calls RAJA_ABORT_OR_THROW when the fitted thread count exceeds the
   // maximum reported by CudaLaunchHelper.
   //
   template<typename Data>

   static inline void exec(Data&& data)

   {


     using data_t = camp::decay<Data>;

     using executor_t =

         cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;

     using launch_t = CudaLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;


     RAJA::resources::Cuda res = data.get_resource();


     //

     // Compute the requested kernel dimensions

     //

     LaunchDims launch_dims = executor_t::calculateDimensions(data);


     // Only launch kernel if we have something to iterate over

     bool active_threads = launch_dims.threads_are_active();

     bool active_blocks  = launch_dims.blocks_are_active();

     int num_blocks      = launch_dims.num_blocks();

     int num_threads     = launch_dims.num_threads();

     // Proceed only if at least one of threads / blocks is active and every
     // active one has a positive count.
     if ((active_threads || active_blocks) &&

         (!active_blocks || num_blocks > 0) &&

         (!active_threads || num_threads > 0))

     {


       //

       // Setup shared memory buffers

       //

       // Starts at zero; this value is what the launch helpers below receive.
       size_t shmem = 0;


       //

       // Compute the recommended physical kernel blocks and threads

       //

       int recommended_blocks;

       int recommended_threads;

       launch_t::recommended_blocks_threads(shmem, recommended_blocks,

                                            recommended_threads);


       //

       // Compute the MAX physical kernel threads

       //

       int max_threads;

       launch_t::max_threads(shmem, max_threads);


       //

       // Fit the requested threads

       //

       // Stays {0, 0, 0} when the recommended thread count is smaller than
       // the minimum thread requirement, i.e. when the fit below is skipped.
       cuda_dim_t fit_threads {0, 0, 0};


       if (recommended_threads >= get_size(launch_dims.min_dims.threads))

       {


         fit_threads = fitCudaDims(recommended_threads, launch_dims.dims.threads,

                                   launch_dims.min_dims.threads);

       }


       //

       // Redo fit with max threads

       //

       // Only done when the maximum is larger than the recommendation and the
       // first fit did not land exactly on the recommended thread count.
       if (recommended_threads < max_threads &&

           get_size(fit_threads) != recommended_threads)

       {


         fit_threads = fitCudaDims(max_threads, launch_dims.dims.threads,

                                   launch_dims.min_dims.threads);

       }


       // NOTE(review): if neither fit above ran, fit_threads is still
       // {0, 0, 0} here -- confirm how num_threads() and the launch treat
       // all-zero thread dimensions.
       launch_dims.dims.threads = fit_threads;


       //

       // Compute the MAX physical kernel blocks

       //

       // Evaluated for the thread count chosen by the fitting above.
       int max_blocks;

       launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads());


       int use_blocks;


       if (launch_dims.num_threads() == recommended_threads)

       {


         //

         // Fit the requested blocks

         //

         use_blocks = recommended_blocks;

       }

       else

       {


         //

         // Fit the max blocks

         //

         use_blocks = max_blocks;

       }


       launch_dims.dims.blocks = fitCudaDims(use_blocks, launch_dims.dims.blocks,

                                             launch_dims.min_dims.blocks);


       //

       // make sure that we fit

       //

       /* Doesn't make sense to check this anymore - AJK

       if(launch_dims.num_blocks() > max_blocks){

         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");

       }*/

       if (launch_dims.num_threads() > max_threads)

       {

         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");

       }


       {

         auto func = launch_t::get_func();

         // The exact policy here does not affect the reduction operation, but

         // we do need to accurately pass a resource and launch dimensions to

         // perform initialization and resolution of reduction parameters.

         using EXEC_POL =

             ::RAJA::policy::cuda::cuda_exec_explicit<LaunchConfig, void, void,

                                                      0, true>;


         // Launch description handed to init_params / resolve_params. Note
         // that dynamic_smem stores the address of the local `shmem`.
         RAJA::cuda::detail::cudaInfo launch_info;

         launch_info.gridDim      = launch_dims.dims.blocks;

         launch_info.blockDim     = launch_dims.dims.threads;

         launch_info.dynamic_smem = &shmem;

         launch_info.res          = res;


         RAJA::expt::detail::init_params<EXEC_POL>(data.param_tuple,

                                                   launch_info);

         //

         // Privatize the LoopData, using make_launch_body to setup reductions

         //

         // Note that there is a circular dependency between the previous setup

         // of the launch_dims and potential changes to shmem here that is

         // currently an unresolved issue.

         //

         auto cuda_data = RAJA::cuda::make_launch_body(

             func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,

             data);


         //

         // Launch the kernel

         //

         // The kernel's single argument is the privatized loop data.
         void* args[] = {(void*)&cuda_data};

         RAJA::cuda::launch(func, launch_dims.dims.blocks,

                            launch_dims.dims.threads, args, shmem, res,

                            launch_t::async);

         // Called right after the launch call, for async and non-async
         // policies alike.
         RAJA::expt::detail::resolve_params<EXEC_POL>(data.param_tuple,

                                                      launch_info);

       }

     }

   }

 };


 }  // namespace internal

 }  // namespace RAJA


 #endif  // closing endif for RAJA_ENABLE_CUDA guard


 #endif  // closing endif for header file include guard

MemUtils_CUDA.hpp
Header file defining prototypes for routines used to manage memory for CUDA reductions and other operations.

policy.hpp
Header file containing RAJA CUDA policy definitions.

macros.hpp
Header file for common RAJA internal macro definitions.

RAJA_ABORT_OR_THROW
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143

RAJA_UNUSED_ARG
#define RAJA_UNUSED_ARG(x)
Definition: macros.hpp:97

RAJA::detail::args
Args args
Definition: WorkRunner.hpp:212

RAJA::internal::StatementList
camp::list< Stmts... > StatementList
Definition: StatementList.hpp:41

RAJA
Definition: AlignedRangeIndexSetBuilders.cpp:35

RAJA::__launch_bounds__
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in

RAJA::Launch::async
@ async

RAJA::get
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56

RAJA::launch
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268

RAJA::max
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155

For.hpp
Header file for statement wrappers and executors.

Lambda.hpp
Header file for kernel lambda executor.

kernel.hpp
RAJA header file containing user interface for RAJA::kernel.

forall.hpp

internal.hpp
RAJA header file containing constructs used to run kernel traversals on GPU with CUDA.

types.hpp
Header file for RAJA type definitions.