21 #ifndef RAJA_MemUtils_CUDA_HPP
22 #define RAJA_MemUtils_CUDA_HPP
24 #include "RAJA/config.hpp"
26 #if defined(RAJA_ENABLE_CUDA)
33 #include <type_traits>
34 #include <unordered_map>
53 cudaDeviceProp get_device_prop()
56 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaGetDevice, &device);
58 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaGetDeviceProperties, &prop, device);
65 cudaDeviceProp& device_prop()
67 static thread_local cudaDeviceProp prop = get_device_prop();
72 struct PinnedAllocator
76 void* malloc(
size_t nbytes)
79 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaHostAlloc, &ptr, nbytes,
87 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFreeHost, ptr);
93 struct DeviceAllocator
97 void* malloc(
size_t nbytes)
100 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMalloc, &ptr, nbytes);
107 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFree, ptr);
114 struct DeviceZeroedAllocator
118 void* malloc(
size_t nbytes)
120 auto res = ::camp::resources::Cuda::get_default();
122 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMalloc, &ptr, nbytes);
123 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemsetAsync, ptr, 0, nbytes,
125 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaStreamSynchronize, res.get_stream());
132 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFree, ptr);
138 struct DevicePinnedAllocator
142 void* malloc(
size_t nbytes)
145 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaGetDevice, &device);
147 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMallocManaged, &ptr, nbytes,
148 cudaMemAttachGlobal);
149 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemAdvise, ptr, nbytes,
150 cudaMemAdviseSetPreferredLocation, device);
151 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemAdvise, ptr, nbytes,
152 cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
160 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFree, ptr);
165 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
166 using device_zeroed_mempool_type =
167 basic_mempool::MemPool<DeviceZeroedAllocator>;
168 using device_pinned_mempool_type =
169 basic_mempool::MemPool<DevicePinnedAllocator>;
170 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
178 const void* func =
nullptr;
179 cuda_dim_t gridDim {0, 0, 0};
180 cuda_dim_t blockDim {0, 0, 0};
181 size_t* dynamic_smem =
nullptr;
182 ::RAJA::resources::Cuda res {::RAJA::resources::Cuda::CudaFromStream(0, 0)};
183 bool setup_reducers =
false;
186 struct cudaStatusInfo : cudaInfo
191 extern cudaStatusInfo g_status;
193 thread_local
extern cudaStatusInfo tl_status;
196 extern std::unordered_map<cudaStream_t, bool> g_stream_info_map;
207 std::lock_guard<std::mutex> lock(detail::g_status.lock);
209 for (
auto& val : detail::g_stream_info_map)
219 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaDeviceSynchronize);
227 std::lock_guard<std::mutex> lock(detail::g_status.lock);
228 auto iter = detail::g_stream_info_map.find(res.get_stream());
229 if (
iter != detail::g_stream_info_map.end())
245 void launch(::RAJA::resources::Cuda res,
bool async =
true)
247 std::lock_guard<std::mutex> lock(detail::g_status.lock);
248 auto iter = detail::g_stream_info_map.find(res.get_stream());
249 if (
iter != detail::g_stream_info_map.end())
255 detail::g_stream_info_map.emplace(res.get_stream(), !async);
265 void launch(
const void* func,
270 ::RAJA::resources::Cuda res,
273 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaLaunchKernel, func, gridDim, blockDim,
274 args, shmem, res.get_stream());
280 void peekAtLastError() { CAMP_CUDA_API_INVOKE_AND_CHECK(cudaPeekAtLastError); }
284 bool setupReducers() {
return detail::tl_status.setup_reducers; }
288 cuda_dim_t currentGridDim() {
return detail::tl_status.gridDim; }
292 cuda_dim_member_t currentGridSize()
294 return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
295 detail::tl_status.gridDim.z;
300 cuda_dim_t currentBlockDim() {
return detail::tl_status.blockDim; }
304 cuda_dim_member_t currentBlockSize()
306 return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
307 detail::tl_status.blockDim.z;
312 size_t currentDynamicShmem() {
return *detail::tl_status.dynamic_smem; }
316 size_t maxDynamicShmem()
318 cudaFuncAttributes func_attr;
319 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFuncGetAttributes, &func_attr,
320 detail::tl_status.func);
321 return func_attr.maxDynamicSharedSizeBytes;
324 constexpr
size_t dynamic_smem_allocation_failure =
337 template<
typename T,
typename GetNFromMax>
338 RAJA_INLINE
size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
339 size_t align =
alignof(T))
341 const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
342 const size_t align_offset = ((unaligned_shmem %
align) !=
size_t(0))
345 const size_t aligned_shmem = unaligned_shmem + align_offset;
347 const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
348 const size_t n_bytes =
sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
349 max_shmem_bytes /
sizeof(T));
351 if (
size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
353 *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
354 return aligned_shmem;
358 return dynamic_smem_allocation_failure;
364 ::RAJA::resources::Cuda currentResource() {
return detail::tl_status.res; }
371 template<
typename LOOP_BODY>
372 RAJA_INLINE
typename std::remove_reference<LOOP_BODY>::type make_launch_body(
376 size_t& dynamic_smem,
377 ::RAJA::resources::Cuda res,
378 LOOP_BODY&& loop_body)
382 detail::cudaInfo {func, gridDim, blockDim, &dynamic_smem, res,
true});
384 using return_type =
typename std::remove_reference<LOOP_BODY>::type;
385 return return_type(std::forward<LOOP_BODY>(loop_body));
388 static constexpr
int cuda_occupancy_uninitialized_int = -1;
389 static constexpr
size_t cuda_occupancy_uninitialized_size_t =
393 struct CudaFixedMaxBlocksData
395 int device_sm_per_device = cuda::device_prop().multiProcessorCount;
396 int device_max_threads_per_sm =
397 cuda::device_prop().maxThreadsPerMultiProcessor;
402 CudaFixedMaxBlocksData cuda_max_blocks()
404 static thread_local CudaFixedMaxBlocksData data;
410 struct CudaOccMaxBlocksThreadsData
412 size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
413 int func_max_blocks_per_device = cuda_occupancy_uninitialized_int;
414 int func_max_threads_per_block = cuda_occupancy_uninitialized_int;
418 template<
typename RAJA_UNUSED_ARG(UniqueMarker)>
419 RAJA_INLINE CudaOccMaxBlocksThreadsData
420 cuda_occupancy_max_blocks_threads(
const void* func,
421 size_t func_dynamic_shmem_per_block)
423 static thread_local CudaOccMaxBlocksThreadsData data;
425 if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
428 data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
430 CAMP_CUDA_API_INVOKE_AND_CHECK(
431 cudaOccupancyMaxPotentialBlockSize, &data.func_max_blocks_per_device,
432 &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block);
439 struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData
441 size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
442 int func_threads_per_block = cuda_occupancy_uninitialized_int;
443 int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int;
447 template<
typename RAJA_UNUSED_ARG(UniqueMarker),
int func_threads_per_block>
448 RAJA_INLINE CudaOccMaxBlocksData
449 cuda_occupancy_max_blocks(
const void* func,
size_t func_dynamic_shmem_per_block)
451 static thread_local CudaOccMaxBlocksData data;
453 if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
456 data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
457 data.func_threads_per_block = func_threads_per_block;
459 CAMP_CUDA_API_INVOKE_AND_CHECK(
460 cudaOccupancyMaxActiveBlocksPerMultiprocessor,
461 &data.func_max_blocks_per_sm, func, func_threads_per_block,
462 func_dynamic_shmem_per_block);
469 template<
typename RAJA_UNUSED_ARG(UniqueMarker)>
470 RAJA_INLINE CudaOccMaxBlocksData
471 cuda_occupancy_max_blocks(
const void* func,
472 size_t func_dynamic_shmem_per_block,
473 int func_threads_per_block)
475 static thread_local CudaOccMaxBlocksData data;
477 if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
478 data.func_threads_per_block != func_threads_per_block)
481 data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
482 data.func_threads_per_block = func_threads_per_block;
484 CAMP_CUDA_API_INVOKE_AND_CHECK(
485 cudaOccupancyMaxActiveBlocksPerMultiprocessor,
486 &data.func_max_blocks_per_sm, func, func_threads_per_block,
487 func_dynamic_shmem_per_block);
519 template<
typename IdxT,
typename Concretizer,
typename UniqueMarker>
520 struct ConcretizerImpl
522 ConcretizerImpl(
const void* func,
523 size_t func_dynamic_shmem_per_block,
526 m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
530 IdxT get_max_block_size()
const
532 auto data = cuda_occupancy_max_blocks_threads<UniqueMarker>(
533 m_func, m_func_dynamic_shmem_per_block);
534 IdxT func_max_threads_per_block = data.func_max_threads_per_block;
535 return func_max_threads_per_block;
539 IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device)
const
541 IdxT func_max_threads_per_block = this->get_max_block_size();
542 IdxT func_threads_per_block =
544 if (func_threads_per_block <= func_max_threads_per_block)
546 return func_threads_per_block;
555 IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block)
const
557 IdxT func_blocks_per_device =
559 return func_blocks_per_device;
563 auto get_block_and_grid_size_to_fit_len()
const
565 IdxT func_max_threads_per_block = this->get_max_block_size();
566 IdxT func_blocks_per_device =
568 return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
572 IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device)
const
574 IdxT func_max_threads_per_block = this->get_max_block_size();
575 IdxT func_threads_per_block =
577 return std::min(func_threads_per_block, func_max_threads_per_block);
581 IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block)
const
583 auto data = cuda_occupancy_max_blocks<UniqueMarker>(
584 m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
585 IdxT func_max_blocks_per_device =
586 Concretizer::template get_max_grid_size<IdxT>(data);
587 IdxT func_blocks_per_device =
589 return std::min(func_blocks_per_device, func_max_blocks_per_device);
593 auto get_block_and_grid_size_to_fit_device()
const
595 IdxT func_max_threads_per_block = this->get_max_block_size();
596 IdxT func_blocks_per_device =
597 this->get_grid_size_to_fit_device(func_max_threads_per_block);
598 return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
603 size_t m_func_dynamic_shmem_per_block;
RAJA header file containing an implementation of a memory pool.
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
RAJA_INLINE void synchronize_impl(const omp_synchronize &)
Synchronize all OpenMP threads and tasks.
Definition: synchronize.hpp:36
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_INLINE void * align(size_t alignment, size_t size, void *&ptr, size_t &space)
Definition: align.hpp:33
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file containing utility methods used in CUDA operations.
Header file for RAJA resource definitions.
Assign a new value to an object and restore the object's previous value at the end of the current sco...
Definition: types.hpp:1028
Header file for RAJA type definitions.