RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
MemUtils_HIP.hpp
Go to the documentation of this file.
1 
12 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
13 // Copyright (c) Lawrence Livermore National Security, LLC and other
14 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
15 // files for dates and other details. No copyright assignment is required
16 // to contribute to RAJA.
17 //
18 // SPDX-License-Identifier: (BSD-3-Clause)
19 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
20 
21 #ifndef RAJA_MemUtils_HIP_HPP
22 #define RAJA_MemUtils_HIP_HPP
23 
24 #include "RAJA/config.hpp"
25 
26 #if defined(RAJA_ENABLE_HIP)
27 
28 #include <cassert>
29 #include <cstddef>
30 #include <cstdio>
31 #include <limits>
32 #include <mutex>
33 #include <type_traits>
34 #include <unordered_map>
35 
36 
38 #include "RAJA/util/types.hpp"
39 #include "RAJA/util/macros.hpp"
40 #include "RAJA/util/resource.hpp"
41 
44 
45 namespace RAJA
46 {
47 
48 namespace hip
49 {
50 
52 RAJA_INLINE
53 hipDeviceProp_t get_device_prop()
54 {
55  int device;
56  CAMP_HIP_API_INVOKE_AND_CHECK(hipGetDevice, &device);
57  hipDeviceProp_t prop;
58  CAMP_HIP_API_INVOKE_AND_CHECK(hipGetDeviceProperties, &prop, device);
59  return prop;
60 }
61 
63 // This caches a copy on first use to speedup later calls.
64 RAJA_INLINE
65 hipDeviceProp_t& device_prop()
66 {
67  static thread_local hipDeviceProp_t prop = get_device_prop();
68  return prop;
69 }
70 
72 struct PinnedAllocator
73 {
74 
75  // returns a valid pointer on success, nullptr on failure
76  void* malloc(size_t nbytes)
77  {
78  void* ptr;
79  CAMP_HIP_API_INVOKE_AND_CHECK(hipHostMalloc, &ptr, nbytes,
80  hipHostMallocMapped |
81  hipHostMallocNonCoherent);
82  return ptr;
83  }
84 
85  // returns true on success, throws a run time error exception on failure
86  bool free(void* ptr)
87  {
88  CAMP_HIP_API_INVOKE_AND_CHECK(hipHostFree, ptr);
89  return true;
90  }
91 };
92 
94 struct DeviceAllocator
95 {
96 
97  // returns a valid pointer on success, nullptr on failure
98  void* malloc(size_t nbytes)
99  {
100  void* ptr;
101  CAMP_HIP_API_INVOKE_AND_CHECK(hipMalloc, &ptr, nbytes);
102  return ptr;
103  }
104 
105  // returns true on success, throws a run time error exception on failure
106  bool free(void* ptr)
107  {
108  CAMP_HIP_API_INVOKE_AND_CHECK(hipFree, ptr);
109  return true;
110  }
111 };
112 
114 // Note: Memory must be zero when returned to mempool
115 struct DeviceZeroedAllocator
116 {
117 
118  // returns a valid pointer on success, nullptr on failure
119  void* malloc(size_t nbytes)
120  {
121  auto res = ::camp::resources::Hip::get_default();
122  void* ptr;
123  CAMP_HIP_API_INVOKE_AND_CHECK(hipMalloc, &ptr, nbytes);
124  CAMP_HIP_API_INVOKE_AND_CHECK(hipMemsetAsync, ptr, 0, nbytes,
125  res.get_stream());
126  CAMP_HIP_API_INVOKE_AND_CHECK(hipStreamSynchronize, res.get_stream());
127  return ptr;
128  }
129 
130  // returns true on success, throws a run time error exception on failure
131  bool free(void* ptr)
132  {
133  CAMP_HIP_API_INVOKE_AND_CHECK(hipFree, ptr);
134  return true;
135  }
136 };
137 
139 struct DevicePinnedAllocator
140 {
141 
142  // returns a valid pointer on success, nullptr on failure
143  void* malloc(size_t nbytes)
144  {
145  void* ptr;
146  CAMP_HIP_API_INVOKE_AND_CHECK(hipMalloc, &ptr, nbytes);
147  return ptr;
148  }
149 
150  // returns true on success, throws a run time error exception on failure
151  bool free(void* ptr)
152  {
153  CAMP_HIP_API_INVOKE_AND_CHECK(hipFree, ptr);
154  return true;
155  }
156 };
157 
158 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
159 using device_zeroed_mempool_type =
160  basic_mempool::MemPool<DeviceZeroedAllocator>;
161 using device_pinned_mempool_type =
162  basic_mempool::MemPool<DevicePinnedAllocator>;
163 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
164 
165 namespace detail
166 {
167 
169 struct hipInfo
170 {
171  const void* func = nullptr;
172  hip_dim_t gridDim {0, 0, 0};
173  hip_dim_t blockDim {0, 0, 0};
174  size_t* dynamic_smem = nullptr;
175  ::RAJA::resources::Hip res {::RAJA::resources::Hip::HipFromStream(0, 0)};
176  bool setup_reducers = false;
177 };
178 
179 struct hipStatusInfo : hipInfo
180 {
181  std::mutex lock;
182 };
183 
184 extern hipStatusInfo g_status;
185 
186 thread_local extern hipStatusInfo tl_status;
187 
188 // stream to synchronization status: true synchronized, false running
189 extern std::unordered_map<hipStream_t, bool> g_stream_info_map;
190 
191 RAJA_INLINE
192 void synchronize_impl(::RAJA::resources::Hip res) { res.wait(); }
193 
194 } // namespace detail
195 
197 RAJA_INLINE
198 void synchronize()
199 {
200  std::lock_guard<std::mutex> lock(detail::g_status.lock);
201  bool synchronize = false;
202  for (auto& val : detail::g_stream_info_map)
203  {
204  if (!val.second)
205  {
206  synchronize = true;
207  val.second = true;
208  }
209  }
210  if (synchronize)
211  {
212  CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
213  }
214 }
215 
217 RAJA_INLINE
218 void synchronize(::RAJA::resources::Hip res)
219 {
220  std::lock_guard<std::mutex> lock(detail::g_status.lock);
221  auto iter = detail::g_stream_info_map.find(res.get_stream());
222  if (iter != detail::g_stream_info_map.end())
223  {
224  if (!iter->second)
225  {
226  iter->second = true;
228  }
229  }
230  else
231  {
232  RAJA_ABORT_OR_THROW("Cannot synchronize unknown resource.");
233  }
234 }
235 
237 RAJA_INLINE
238 void launch(::RAJA::resources::Hip res, bool async = true)
239 {
240  std::lock_guard<std::mutex> lock(detail::g_status.lock);
241  auto iter = detail::g_stream_info_map.find(res.get_stream());
242  if (iter != detail::g_stream_info_map.end())
243  {
244  iter->second = !async;
245  }
246  else
247  {
248  detail::g_stream_info_map.emplace(res.get_stream(), !async);
249  }
250  if (!async)
251  {
253  }
254 }
255 
257 RAJA_INLINE
258 void launch(const void* func,
259  hip_dim_t gridDim,
260  hip_dim_t blockDim,
261  void** args,
262  size_t shmem,
263  ::RAJA::resources::Hip res,
264  bool async = true)
265 {
266  CAMP_HIP_API_INVOKE_AND_CHECK(hipLaunchKernel, func, gridDim, blockDim, args,
267  shmem, res.get_stream());
268  launch(res, async);
269 }
270 
272 RAJA_INLINE
273 void peekAtLastError() { CAMP_HIP_API_INVOKE_AND_CHECK(hipPeekAtLastError); }
274 
276 RAJA_INLINE
277 bool setupReducers() { return detail::tl_status.setup_reducers; }
278 
280 RAJA_INLINE
281 hip_dim_t currentGridDim() { return detail::tl_status.gridDim; }
282 
284 RAJA_INLINE
285 hip_dim_member_t currentGridSize()
286 {
287  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
288  detail::tl_status.gridDim.z;
289 }
290 
292 RAJA_INLINE
293 hip_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
294 
296 RAJA_INLINE
297 hip_dim_member_t currentBlockSize()
298 {
299  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
300  detail::tl_status.blockDim.z;
301 }
302 
304 RAJA_INLINE
305 size_t currentDynamicShmem() { return *detail::tl_status.dynamic_smem; }
306 
308 RAJA_INLINE
309 size_t maxDynamicShmem()
310 {
311  hipFuncAttributes func_attr;
312  CAMP_HIP_API_INVOKE_AND_CHECK(hipFuncGetAttributes, &func_attr,
313  detail::tl_status.func);
314  return func_attr.maxDynamicSharedSizeBytes;
315 }
316 
317 constexpr size_t dynamic_smem_allocation_failure =
319 
321 //
322 // The first argument is a functional object that takes the maximum number of
323 // objects that can fit into the dynamic shared memory available and returns
324 // the number of objects to allocate.
325 // The second argument is the required alignment.
326 //
327 // Returns an offset into dynamic shared memory aligned to align on success,
328 // or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
329 // takes the failure return path.
330 template<typename T, typename GetNFromMax>
331 RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
332  size_t align = alignof(T))
333 {
334  const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
335  const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
336  ? align - (unaligned_shmem % align)
337  : size_t(0);
338  const size_t aligned_shmem = unaligned_shmem + align_offset;
339 
340  const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
341  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
342  max_shmem_bytes / sizeof(T));
343 
344  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
345  {
346  *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
347  return aligned_shmem;
348  }
349  else
350  {
351  return dynamic_smem_allocation_failure;
352  }
353 }
354 
356 RAJA_INLINE
357 ::RAJA::resources::Hip currentResource() { return detail::tl_status.res; }
358 
360 //
361 // Note: This is done to setup the Reducer and MultiReducer objects through
362 // their copy constructors. Both look at tl_status to setup per kernel launch
363 // resources.
364 template<typename LOOP_BODY>
365 RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
366  const void* func,
367  hip_dim_t gridDim,
368  hip_dim_t blockDim,
369  size_t& dynamic_smem,
370  ::RAJA::resources::Hip res,
371  LOOP_BODY&& loop_body)
372 {
374  detail::tl_status,
375  detail::hipInfo {func, gridDim, blockDim, &dynamic_smem, res, true});
376 
377  using return_type = typename std::remove_reference<LOOP_BODY>::type;
378  return return_type(std::forward<LOOP_BODY>(loop_body));
379 }
380 
381 static constexpr int hip_occupancy_uninitialized_int = -1;
382 static constexpr size_t hip_occupancy_uninitialized_size_t =
384 
386 struct HipFixedMaxBlocksData
387 {
388  int device_sm_per_device = hip::device_prop().multiProcessorCount;
389  int device_max_threads_per_sm =
390  hip::device_prop().maxThreadsPerMultiProcessor;
391 };
392 
394 RAJA_INLINE
395 HipFixedMaxBlocksData hip_max_blocks()
396 {
397  static thread_local HipFixedMaxBlocksData data;
398 
399  return data;
400 }
401 
403 struct HipOccMaxBlocksThreadsData
404 {
405  size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
406  int func_max_blocks_per_device = hip_occupancy_uninitialized_int;
407  int func_max_threads_per_block = hip_occupancy_uninitialized_int;
408 };
409 
411 template<typename RAJA_UNUSED_ARG(UniqueMarker)>
412 RAJA_INLINE HipOccMaxBlocksThreadsData
413 hip_occupancy_max_blocks_threads(const void* func,
414  size_t func_dynamic_shmem_per_block)
415 {
416  static thread_local HipOccMaxBlocksThreadsData data;
417 
418  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
419  {
420 
421  data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
422 
423 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
424  CAMP_HIP_API_INVOKE_AND_CHECK(
425  hipOccupancyMaxPotentialBlockSize, &data.func_max_blocks_per_device,
426  &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block);
427 #else
428  RAJA_UNUSED_VAR(func);
429  hipDeviceProp_t& prop = hip::device_prop();
430  data.func_max_blocks_per_device = prop.multiProcessorCount;
431  data.func_max_threads_per_block = 1024;
432 #endif
433  }
434 
435  return data;
436 }
437 
439 struct HipOccMaxBlocksData : HipFixedMaxBlocksData
440 {
441  size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
442  int func_threads_per_block = hip_occupancy_uninitialized_int;
443  int func_max_blocks_per_sm = hip_occupancy_uninitialized_int;
444 };
445 
447 template<typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
448 RAJA_INLINE HipOccMaxBlocksData
449 hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
450 {
451  static thread_local HipOccMaxBlocksData data;
452 
453  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
454  {
455 
456  data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
457  data.func_threads_per_block = func_threads_per_block;
458 
459 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
460  CAMP_HIP_API_INVOKE_AND_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor,
461  &data.func_max_blocks_per_sm, func,
462  func_threads_per_block,
463  func_dynamic_shmem_per_block);
464 #else
465  RAJA_UNUSED_VAR(func);
466  data.func_max_blocks_per_sm =
467  hip::device_prop().maxThreadsPerMultiProcessor / 1024;
468  if (data.func_max_blocks_per_sm <= 0)
469  {
470  data.func_max_blocks_per_sm = 1
471  }
472 #endif
473  }
474 
475  return data;
476 }
477 
479 template<typename RAJA_UNUSED_ARG(UniqueMarker)>
480 RAJA_INLINE HipOccMaxBlocksData
481 hip_occupancy_max_blocks(const void* func,
482  size_t func_dynamic_shmem_per_block,
483  int func_threads_per_block)
484 {
485  static thread_local HipOccMaxBlocksData data;
486 
487  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
488  data.func_threads_per_block != func_threads_per_block)
489  {
490 
491  data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
492  data.func_threads_per_block = func_threads_per_block;
493 
494 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
495  CAMP_HIP_API_INVOKE_AND_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor,
496  &data.func_max_blocks_per_sm, func,
497  func_threads_per_block,
498  func_dynamic_shmem_per_block);
499 #else
500  RAJA_UNUSED_VAR(func);
501  data.func_max_blocks_per_sm =
502  hip::device_prop().maxThreadsPerMultiProcessor / 1024;
503  if (data.func_max_blocks_per_sm <= 0)
504  {
505  data.func_max_blocks_per_sm = 1
506  }
507 #endif
508  }
509 
510  return data;
511 }
512 
539 template<typename IdxT, typename Concretizer, typename UniqueMarker>
540 struct ConcretizerImpl
541 {
542  ConcretizerImpl(const void* func,
543  size_t func_dynamic_shmem_per_block,
544  IdxT len)
545  : m_func(func),
546  m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
547  m_len(len)
548  {}
549 
550  IdxT get_max_block_size() const
551  {
552  auto data = hip_occupancy_max_blocks_threads<UniqueMarker>(
553  m_func, m_func_dynamic_shmem_per_block);
554  IdxT func_max_threads_per_block = data.func_max_threads_per_block;
555  return func_max_threads_per_block;
556  }
557 
559  IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
560  {
561  IdxT func_max_threads_per_block = this->get_max_block_size();
562  IdxT func_threads_per_block =
563  RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
564  if (func_threads_per_block <= func_max_threads_per_block)
565  {
566  return func_threads_per_block;
567  }
568  else
569  {
570  return IdxT(0);
571  }
572  }
573 
575  IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
576  {
577  IdxT func_blocks_per_device =
578  RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
579  return func_blocks_per_device;
580  }
581 
583  auto get_block_and_grid_size_to_fit_len() const
584  {
585  IdxT func_max_threads_per_block = this->get_max_block_size();
586  IdxT func_blocks_per_device =
587  RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
588  return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
589  }
590 
592  IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
593  {
594  IdxT func_max_threads_per_block = this->get_max_block_size();
595  IdxT func_threads_per_block =
596  RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
597  return std::min(func_threads_per_block, func_max_threads_per_block);
598  }
599 
601  IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const
602  {
603  auto data = hip_occupancy_max_blocks<UniqueMarker>(
604  m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
605  IdxT func_max_blocks_per_device =
606  Concretizer::template get_max_grid_size<IdxT>(data);
607  IdxT func_blocks_per_device =
608  RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
609  return std::min(func_blocks_per_device, func_max_blocks_per_device);
610  }
611 
613  auto get_block_and_grid_size_to_fit_device() const
614  {
615  IdxT func_max_threads_per_block = this->get_max_block_size();
616  IdxT func_blocks_per_device =
617  this->get_grid_size_to_fit_device(func_max_threads_per_block);
618  return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
619  }
620 
621 private:
622  const void* m_func;
623  size_t m_func_dynamic_shmem_per_block;
624  IdxT m_len;
625 };
626 
627 } // namespace hip
628 
629 } // namespace RAJA
630 
631 #endif // closing endif for RAJA_ENABLE_HIP
632 
633 #endif // closing endif for header file include guard
RAJA header file containing an implementation of a memory pool.
Header file containing RAJA HIP policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
Definition: macros.hpp:120
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
RAJA_INLINE void synchronize_impl(const omp_synchronize &)
Synchronize all OpenMP threads and tasks.
Definition: synchronize.hpp:36
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_INLINE void * align(size_t alignment, size_t size, void *&ptr, size_t &space)
Definition: align.hpp:33
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file containing utility methods used in HIP operations.
Header file for RAJA resource definitions.
Assign a new value to an object and restore the object's previous value at the end of the current sco...
Definition: types.hpp:1028
Header file for RAJA type definitions.