21 #ifndef RAJA_MemUtils_HIP_HPP
22 #define RAJA_MemUtils_HIP_HPP
24 #include "RAJA/config.hpp"
26 #if defined(RAJA_ENABLE_HIP)
33 #include <type_traits>
34 #include <unordered_map>
53 hipDeviceProp_t get_device_prop()
56 CAMP_HIP_API_INVOKE_AND_CHECK(hipGetDevice, &device);
58 CAMP_HIP_API_INVOKE_AND_CHECK(hipGetDeviceProperties, &prop, device);
65 hipDeviceProp_t& device_prop()
67 static thread_local hipDeviceProp_t prop = get_device_prop();
72 struct PinnedAllocator
76 void* malloc(
size_t nbytes)
79 CAMP_HIP_API_INVOKE_AND_CHECK(hipHostMalloc, &ptr, nbytes,
81 hipHostMallocNonCoherent);
88 CAMP_HIP_API_INVOKE_AND_CHECK(hipHostFree, ptr);
94 struct DeviceAllocator
98 void* malloc(
size_t nbytes)
101 CAMP_HIP_API_INVOKE_AND_CHECK(hipMalloc, &ptr, nbytes);
108 CAMP_HIP_API_INVOKE_AND_CHECK(hipFree, ptr);
115 struct DeviceZeroedAllocator
119 void* malloc(
size_t nbytes)
121 auto res = ::camp::resources::Hip::get_default();
123 CAMP_HIP_API_INVOKE_AND_CHECK(hipMalloc, &ptr, nbytes);
124 CAMP_HIP_API_INVOKE_AND_CHECK(hipMemsetAsync, ptr, 0, nbytes,
126 CAMP_HIP_API_INVOKE_AND_CHECK(hipStreamSynchronize, res.get_stream());
133 CAMP_HIP_API_INVOKE_AND_CHECK(hipFree, ptr);
139 struct DevicePinnedAllocator
143 void* malloc(
size_t nbytes)
146 CAMP_HIP_API_INVOKE_AND_CHECK(hipMalloc, &ptr, nbytes);
153 CAMP_HIP_API_INVOKE_AND_CHECK(hipFree, ptr);
158 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
159 using device_zeroed_mempool_type =
160 basic_mempool::MemPool<DeviceZeroedAllocator>;
161 using device_pinned_mempool_type =
162 basic_mempool::MemPool<DevicePinnedAllocator>;
163 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
171 const void* func =
nullptr;
172 hip_dim_t gridDim {0, 0, 0};
173 hip_dim_t blockDim {0, 0, 0};
174 size_t* dynamic_smem =
nullptr;
175 ::RAJA::resources::Hip res {::RAJA::resources::Hip::HipFromStream(0, 0)};
176 bool setup_reducers =
false;
179 struct hipStatusInfo : hipInfo
184 extern hipStatusInfo g_status;
186 thread_local
extern hipStatusInfo tl_status;
189 extern std::unordered_map<hipStream_t, bool> g_stream_info_map;
200 std::lock_guard<std::mutex> lock(detail::g_status.lock);
202 for (
auto& val : detail::g_stream_info_map)
212 CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
220 std::lock_guard<std::mutex> lock(detail::g_status.lock);
221 auto iter = detail::g_stream_info_map.find(res.get_stream());
222 if (
iter != detail::g_stream_info_map.end())
238 void launch(::RAJA::resources::Hip res,
bool async =
true)
240 std::lock_guard<std::mutex> lock(detail::g_status.lock);
241 auto iter = detail::g_stream_info_map.find(res.get_stream());
242 if (
iter != detail::g_stream_info_map.end())
248 detail::g_stream_info_map.emplace(res.get_stream(), !async);
258 void launch(
const void* func,
263 ::RAJA::resources::Hip res,
266 CAMP_HIP_API_INVOKE_AND_CHECK(hipLaunchKernel, func, gridDim, blockDim,
args,
267 shmem, res.get_stream());
273 void peekAtLastError() { CAMP_HIP_API_INVOKE_AND_CHECK(hipPeekAtLastError); }
277 bool setupReducers() {
return detail::tl_status.setup_reducers; }
281 hip_dim_t currentGridDim() {
return detail::tl_status.gridDim; }
285 hip_dim_member_t currentGridSize()
287 return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
288 detail::tl_status.gridDim.z;
293 hip_dim_t currentBlockDim() {
return detail::tl_status.blockDim; }
297 hip_dim_member_t currentBlockSize()
299 return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
300 detail::tl_status.blockDim.z;
305 size_t currentDynamicShmem() {
return *detail::tl_status.dynamic_smem; }
309 size_t maxDynamicShmem()
311 hipFuncAttributes func_attr;
312 CAMP_HIP_API_INVOKE_AND_CHECK(hipFuncGetAttributes, &func_attr,
313 detail::tl_status.func);
314 return func_attr.maxDynamicSharedSizeBytes;
317 constexpr
size_t dynamic_smem_allocation_failure =
330 template<
typename T,
typename GetNFromMax>
331 RAJA_INLINE
size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
332 size_t align =
alignof(T))
334 const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
335 const size_t align_offset = ((unaligned_shmem %
align) !=
size_t(0))
338 const size_t aligned_shmem = unaligned_shmem + align_offset;
340 const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
341 const size_t n_bytes =
sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
342 max_shmem_bytes /
sizeof(T));
344 if (
size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
346 *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
347 return aligned_shmem;
351 return dynamic_smem_allocation_failure;
357 ::RAJA::resources::Hip currentResource() {
return detail::tl_status.res; }
364 template<
typename LOOP_BODY>
365 RAJA_INLINE
typename std::remove_reference<LOOP_BODY>::type make_launch_body(
369 size_t& dynamic_smem,
370 ::RAJA::resources::Hip res,
371 LOOP_BODY&& loop_body)
375 detail::hipInfo {func, gridDim, blockDim, &dynamic_smem, res,
true});
377 using return_type =
typename std::remove_reference<LOOP_BODY>::type;
378 return return_type(std::forward<LOOP_BODY>(loop_body));
381 static constexpr
int hip_occupancy_uninitialized_int = -1;
382 static constexpr
size_t hip_occupancy_uninitialized_size_t =
386 struct HipFixedMaxBlocksData
388 int device_sm_per_device = hip::device_prop().multiProcessorCount;
389 int device_max_threads_per_sm =
390 hip::device_prop().maxThreadsPerMultiProcessor;
395 HipFixedMaxBlocksData hip_max_blocks()
397 static thread_local HipFixedMaxBlocksData data;
403 struct HipOccMaxBlocksThreadsData
405 size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
406 int func_max_blocks_per_device = hip_occupancy_uninitialized_int;
407 int func_max_threads_per_block = hip_occupancy_uninitialized_int;
411 template<
typename RAJA_UNUSED_ARG(UniqueMarker)>
412 RAJA_INLINE HipOccMaxBlocksThreadsData
413 hip_occupancy_max_blocks_threads(
const void* func,
414 size_t func_dynamic_shmem_per_block)
416 static thread_local HipOccMaxBlocksThreadsData data;
418 if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
421 data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
423 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
424 CAMP_HIP_API_INVOKE_AND_CHECK(
425 hipOccupancyMaxPotentialBlockSize, &data.func_max_blocks_per_device,
426 &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block);
429 hipDeviceProp_t& prop = hip::device_prop();
430 data.func_max_blocks_per_device = prop.multiProcessorCount;
431 data.func_max_threads_per_block = 1024;
439 struct HipOccMaxBlocksData : HipFixedMaxBlocksData
441 size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
442 int func_threads_per_block = hip_occupancy_uninitialized_int;
443 int func_max_blocks_per_sm = hip_occupancy_uninitialized_int;
447 template<
typename RAJA_UNUSED_ARG(UniqueMarker),
int func_threads_per_block>
448 RAJA_INLINE HipOccMaxBlocksData
449 hip_occupancy_max_blocks(
const void* func,
size_t func_dynamic_shmem_per_block)
451 static thread_local HipOccMaxBlocksData data;
453 if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
456 data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
457 data.func_threads_per_block = func_threads_per_block;
459 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
460 CAMP_HIP_API_INVOKE_AND_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor,
461 &data.func_max_blocks_per_sm, func,
462 func_threads_per_block,
463 func_dynamic_shmem_per_block);
466 data.func_max_blocks_per_sm =
467 hip::device_prop().maxThreadsPerMultiProcessor / 1024;
468 if (data.func_max_blocks_per_sm <= 0)
470 data.func_max_blocks_per_sm = 1
479 template<
typename RAJA_UNUSED_ARG(UniqueMarker)>
480 RAJA_INLINE HipOccMaxBlocksData
481 hip_occupancy_max_blocks(
const void* func,
482 size_t func_dynamic_shmem_per_block,
483 int func_threads_per_block)
485 static thread_local HipOccMaxBlocksData data;
487 if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
488 data.func_threads_per_block != func_threads_per_block)
491 data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
492 data.func_threads_per_block = func_threads_per_block;
494 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
495 CAMP_HIP_API_INVOKE_AND_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor,
496 &data.func_max_blocks_per_sm, func,
497 func_threads_per_block,
498 func_dynamic_shmem_per_block);
501 data.func_max_blocks_per_sm =
502 hip::device_prop().maxThreadsPerMultiProcessor / 1024;
503 if (data.func_max_blocks_per_sm <= 0)
505 data.func_max_blocks_per_sm = 1
539 template<
typename IdxT,
typename Concretizer,
typename UniqueMarker>
540 struct ConcretizerImpl
542 ConcretizerImpl(
const void* func,
543 size_t func_dynamic_shmem_per_block,
546 m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
550 IdxT get_max_block_size()
const
552 auto data = hip_occupancy_max_blocks_threads<UniqueMarker>(
553 m_func, m_func_dynamic_shmem_per_block);
554 IdxT func_max_threads_per_block = data.func_max_threads_per_block;
555 return func_max_threads_per_block;
559 IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device)
const
561 IdxT func_max_threads_per_block = this->get_max_block_size();
562 IdxT func_threads_per_block =
564 if (func_threads_per_block <= func_max_threads_per_block)
566 return func_threads_per_block;
575 IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block)
const
577 IdxT func_blocks_per_device =
579 return func_blocks_per_device;
583 auto get_block_and_grid_size_to_fit_len()
const
585 IdxT func_max_threads_per_block = this->get_max_block_size();
586 IdxT func_blocks_per_device =
588 return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
592 IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device)
const
594 IdxT func_max_threads_per_block = this->get_max_block_size();
595 IdxT func_threads_per_block =
597 return std::min(func_threads_per_block, func_max_threads_per_block);
601 IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block)
const
603 auto data = hip_occupancy_max_blocks<UniqueMarker>(
604 m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
605 IdxT func_max_blocks_per_device =
606 Concretizer::template get_max_grid_size<IdxT>(data);
607 IdxT func_blocks_per_device =
609 return std::min(func_blocks_per_device, func_max_blocks_per_device);
613 auto get_block_and_grid_size_to_fit_device()
const
615 IdxT func_max_threads_per_block = this->get_max_block_size();
616 IdxT func_blocks_per_device =
617 this->get_grid_size_to_fit_device(func_max_threads_per_block);
618 return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
623 size_t m_func_dynamic_shmem_per_block;
RAJA header file containing an implementation of a memory pool.
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
Definition: macros.hpp:120
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
RAJA_INLINE void synchronize_impl(const omp_synchronize &)
Synchronize all OpenMP threads and tasks.
Definition: synchronize.hpp:36
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_INLINE void * align(size_t alignment, size_t size, void *&ptr, size_t &space)
Definition: align.hpp:33
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file containing utility methods used in HIP operations.
Header file for RAJA resource definitions.
Assign a new value to an object and restore the object's previous value at the end of the current sco...
Definition: types.hpp:1028
Header file for RAJA type definitions.