RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
MemUtils_CUDA.hpp
Go to the documentation of this file.
1 
12 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
13 // Copyright (c) Lawrence Livermore National Security, LLC and other
14 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
15 // files for dates and other details. No copyright assignment is required
16 // to contribute to RAJA.
17 //
18 // SPDX-License-Identifier: (BSD-3-Clause)
19 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
20 
21 #ifndef RAJA_MemUtils_CUDA_HPP
22 #define RAJA_MemUtils_CUDA_HPP
23 
24 #include "RAJA/config.hpp"
25 
26 #if defined(RAJA_ENABLE_CUDA)
27 
28 #include <cassert>
29 #include <cstddef>
30 #include <cstdio>
31 #include <limits>
32 #include <mutex>
33 #include <type_traits>
34 #include <unordered_map>
35 
36 
38 #include "RAJA/util/types.hpp"
39 #include "RAJA/util/macros.hpp"
40 #include "RAJA/util/resource.hpp"
41 
44 
45 namespace RAJA
46 {
47 
48 namespace cuda
49 {
50 
52 RAJA_INLINE
53 cudaDeviceProp get_device_prop()
54 {
55  int device;
56  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaGetDevice, &device);
57  cudaDeviceProp prop;
58  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaGetDeviceProperties, &prop, device);
59  return prop;
60 }
61 
63 // This caches a copy on first use to speedup later calls.
64 RAJA_INLINE
65 cudaDeviceProp& device_prop()
66 {
67  static thread_local cudaDeviceProp prop = get_device_prop();
68  return prop;
69 }
70 
72 struct PinnedAllocator
73 {
74 
75  // returns a valid pointer on success, nullptr on failure
76  void* malloc(size_t nbytes)
77  {
78  void* ptr;
79  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaHostAlloc, &ptr, nbytes,
80  cudaHostAllocMapped);
81  return ptr;
82  }
83 
84  // returns true on success, throws a run time error exception on failure
85  bool free(void* ptr)
86  {
87  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFreeHost, ptr);
88  return true;
89  }
90 };
91 
93 struct DeviceAllocator
94 {
95 
96  // returns a valid pointer on success, nullptr on failure
97  void* malloc(size_t nbytes)
98  {
99  void* ptr;
100  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMalloc, &ptr, nbytes);
101  return ptr;
102  }
103 
104  // returns true on success, throws a run time error exception on failure
105  bool free(void* ptr)
106  {
107  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFree, ptr);
108  return true;
109  }
110 };
111 
113 // Note: Memory must be zero when returned to mempool
114 struct DeviceZeroedAllocator
115 {
116 
117  // returns a valid pointer on success, nullptr on failure
118  void* malloc(size_t nbytes)
119  {
120  auto res = ::camp::resources::Cuda::get_default();
121  void* ptr;
122  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMalloc, &ptr, nbytes);
123  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemsetAsync, ptr, 0, nbytes,
124  res.get_stream());
125  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaStreamSynchronize, res.get_stream());
126  return ptr;
127  }
128 
129  // returns true on success, throws a run time error exception on failure
130  bool free(void* ptr)
131  {
132  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFree, ptr);
133  return true;
134  }
135 };
136 
138 struct DevicePinnedAllocator
139 {
140 
141  // returns a valid pointer on success, nullptr on failure
142  void* malloc(size_t nbytes)
143  {
144  int device;
145  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaGetDevice, &device);
146  void* ptr;
147  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMallocManaged, &ptr, nbytes,
148  cudaMemAttachGlobal);
149  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemAdvise, ptr, nbytes,
150  cudaMemAdviseSetPreferredLocation, device);
151  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemAdvise, ptr, nbytes,
152  cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
153 
154  return ptr;
155  }
156 
157  // returns true on success, throws a run time error exception on failure
158  bool free(void* ptr)
159  {
160  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFree, ptr);
161  return true;
162  }
163 };
164 
165 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
166 using device_zeroed_mempool_type =
167  basic_mempool::MemPool<DeviceZeroedAllocator>;
168 using device_pinned_mempool_type =
169  basic_mempool::MemPool<DevicePinnedAllocator>;
170 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
171 
172 namespace detail
173 {
174 
176 struct cudaInfo
177 {
178  const void* func = nullptr;
179  cuda_dim_t gridDim {0, 0, 0};
180  cuda_dim_t blockDim {0, 0, 0};
181  size_t* dynamic_smem = nullptr;
182  ::RAJA::resources::Cuda res {::RAJA::resources::Cuda::CudaFromStream(0, 0)};
183  bool setup_reducers = false;
184 };
185 
186 struct cudaStatusInfo : cudaInfo
187 {
188  std::mutex lock;
189 };
190 
191 extern cudaStatusInfo g_status;
192 
193 thread_local extern cudaStatusInfo tl_status;
194 
195 // stream to synchronization status: true synchronized, false running
196 extern std::unordered_map<cudaStream_t, bool> g_stream_info_map;
197 
198 RAJA_INLINE
199 void synchronize_impl(::RAJA::resources::Cuda res) { res.wait(); }
200 
201 } // namespace detail
202 
204 RAJA_INLINE
205 void synchronize()
206 {
207  std::lock_guard<std::mutex> lock(detail::g_status.lock);
208  bool synchronize = false;
209  for (auto& val : detail::g_stream_info_map)
210  {
211  if (!val.second)
212  {
213  synchronize = true;
214  val.second = true;
215  }
216  }
217  if (synchronize)
218  {
219  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaDeviceSynchronize);
220  }
221 }
222 
224 RAJA_INLINE
225 void synchronize(::RAJA::resources::Cuda res)
226 {
227  std::lock_guard<std::mutex> lock(detail::g_status.lock);
228  auto iter = detail::g_stream_info_map.find(res.get_stream());
229  if (iter != detail::g_stream_info_map.end())
230  {
231  if (!iter->second)
232  {
233  iter->second = true;
235  }
236  }
237  else
238  {
239  RAJA_ABORT_OR_THROW("Cannot synchronize unknown resource.");
240  }
241 }
242 
244 RAJA_INLINE
245 void launch(::RAJA::resources::Cuda res, bool async = true)
246 {
247  std::lock_guard<std::mutex> lock(detail::g_status.lock);
248  auto iter = detail::g_stream_info_map.find(res.get_stream());
249  if (iter != detail::g_stream_info_map.end())
250  {
251  iter->second = !async;
252  }
253  else
254  {
255  detail::g_stream_info_map.emplace(res.get_stream(), !async);
256  }
257  if (!async)
258  {
260  }
261 }
262 
264 RAJA_INLINE
265 void launch(const void* func,
266  cuda_dim_t gridDim,
267  cuda_dim_t blockDim,
268  void** args,
269  size_t shmem,
270  ::RAJA::resources::Cuda res,
271  bool async = true)
272 {
273  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaLaunchKernel, func, gridDim, blockDim,
274  args, shmem, res.get_stream());
275  launch(res, async);
276 }
277 
279 RAJA_INLINE
280 void peekAtLastError() { CAMP_CUDA_API_INVOKE_AND_CHECK(cudaPeekAtLastError); }
281 
283 RAJA_INLINE
284 bool setupReducers() { return detail::tl_status.setup_reducers; }
285 
287 RAJA_INLINE
288 cuda_dim_t currentGridDim() { return detail::tl_status.gridDim; }
289 
291 RAJA_INLINE
292 cuda_dim_member_t currentGridSize()
293 {
294  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
295  detail::tl_status.gridDim.z;
296 }
297 
299 RAJA_INLINE
300 cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
301 
303 RAJA_INLINE
304 cuda_dim_member_t currentBlockSize()
305 {
306  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
307  detail::tl_status.blockDim.z;
308 }
309 
311 RAJA_INLINE
312 size_t currentDynamicShmem() { return *detail::tl_status.dynamic_smem; }
313 
315 RAJA_INLINE
316 size_t maxDynamicShmem()
317 {
318  cudaFuncAttributes func_attr;
319  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaFuncGetAttributes, &func_attr,
320  detail::tl_status.func);
321  return func_attr.maxDynamicSharedSizeBytes;
322 }
323 
324 constexpr size_t dynamic_smem_allocation_failure =
326 
328 //
329 // The first argument is a functional object that takes the maximum number of
330 // objects that can fit into the dynamic shared memory available and returns
331 // the number of objects to allocate.
332 // The second argument is the required alignment.
333 //
334 // Returns an offset into dynamic shared memory aligned to align on success,
335 // or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
336 // takes the failure return path.
337 template<typename T, typename GetNFromMax>
338 RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
339  size_t align = alignof(T))
340 {
341  const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
342  const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
343  ? align - (unaligned_shmem % align)
344  : size_t(0);
345  const size_t aligned_shmem = unaligned_shmem + align_offset;
346 
347  const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
348  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
349  max_shmem_bytes / sizeof(T));
350 
351  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
352  {
353  *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
354  return aligned_shmem;
355  }
356  else
357  {
358  return dynamic_smem_allocation_failure;
359  }
360 }
361 
363 RAJA_INLINE
364 ::RAJA::resources::Cuda currentResource() { return detail::tl_status.res; }
365 
367 //
368 // Note: This is done to setup the Reducer and MultiReducer objects through
369 // their copy constructors. Both look at tl_status to setup per kernel launch
370 // resources.
371 template<typename LOOP_BODY>
372 RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
373  const void* func,
374  cuda_dim_t gridDim,
375  cuda_dim_t blockDim,
376  size_t& dynamic_smem,
377  ::RAJA::resources::Cuda res,
378  LOOP_BODY&& loop_body)
379 {
381  detail::tl_status,
382  detail::cudaInfo {func, gridDim, blockDim, &dynamic_smem, res, true});
383 
384  using return_type = typename std::remove_reference<LOOP_BODY>::type;
385  return return_type(std::forward<LOOP_BODY>(loop_body));
386 }
387 
388 static constexpr int cuda_occupancy_uninitialized_int = -1;
389 static constexpr size_t cuda_occupancy_uninitialized_size_t =
391 
393 struct CudaFixedMaxBlocksData
394 {
395  int device_sm_per_device = cuda::device_prop().multiProcessorCount;
396  int device_max_threads_per_sm =
397  cuda::device_prop().maxThreadsPerMultiProcessor;
398 };
399 
401 RAJA_INLINE
402 CudaFixedMaxBlocksData cuda_max_blocks()
403 {
404  static thread_local CudaFixedMaxBlocksData data;
405 
406  return data;
407 }
408 
410 struct CudaOccMaxBlocksThreadsData
411 {
412  size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
413  int func_max_blocks_per_device = cuda_occupancy_uninitialized_int;
414  int func_max_threads_per_block = cuda_occupancy_uninitialized_int;
415 };
416 
418 template<typename RAJA_UNUSED_ARG(UniqueMarker)>
419 RAJA_INLINE CudaOccMaxBlocksThreadsData
420 cuda_occupancy_max_blocks_threads(const void* func,
421  size_t func_dynamic_shmem_per_block)
422 {
423  static thread_local CudaOccMaxBlocksThreadsData data;
424 
425  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
426  {
427 
428  data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
429 
430  CAMP_CUDA_API_INVOKE_AND_CHECK(
431  cudaOccupancyMaxPotentialBlockSize, &data.func_max_blocks_per_device,
432  &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block);
433  }
434 
435  return data;
436 }
437 
439 struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData
440 {
441  size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
442  int func_threads_per_block = cuda_occupancy_uninitialized_int;
443  int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int;
444 };
445 
447 template<typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
448 RAJA_INLINE CudaOccMaxBlocksData
449 cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
450 {
451  static thread_local CudaOccMaxBlocksData data;
452 
453  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
454  {
455 
456  data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
457  data.func_threads_per_block = func_threads_per_block;
458 
459  CAMP_CUDA_API_INVOKE_AND_CHECK(
460  cudaOccupancyMaxActiveBlocksPerMultiprocessor,
461  &data.func_max_blocks_per_sm, func, func_threads_per_block,
462  func_dynamic_shmem_per_block);
463  }
464 
465  return data;
466 }
467 
469 template<typename RAJA_UNUSED_ARG(UniqueMarker)>
470 RAJA_INLINE CudaOccMaxBlocksData
471 cuda_occupancy_max_blocks(const void* func,
472  size_t func_dynamic_shmem_per_block,
473  int func_threads_per_block)
474 {
475  static thread_local CudaOccMaxBlocksData data;
476 
477  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
478  data.func_threads_per_block != func_threads_per_block)
479  {
480 
481  data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
482  data.func_threads_per_block = func_threads_per_block;
483 
484  CAMP_CUDA_API_INVOKE_AND_CHECK(
485  cudaOccupancyMaxActiveBlocksPerMultiprocessor,
486  &data.func_max_blocks_per_sm, func, func_threads_per_block,
487  func_dynamic_shmem_per_block);
488  }
489 
490  return data;
491 }
492 
519 template<typename IdxT, typename Concretizer, typename UniqueMarker>
520 struct ConcretizerImpl
521 {
522  ConcretizerImpl(const void* func,
523  size_t func_dynamic_shmem_per_block,
524  IdxT len)
525  : m_func(func),
526  m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
527  m_len(len)
528  {}
529 
530  IdxT get_max_block_size() const
531  {
532  auto data = cuda_occupancy_max_blocks_threads<UniqueMarker>(
533  m_func, m_func_dynamic_shmem_per_block);
534  IdxT func_max_threads_per_block = data.func_max_threads_per_block;
535  return func_max_threads_per_block;
536  }
537 
539  IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
540  {
541  IdxT func_max_threads_per_block = this->get_max_block_size();
542  IdxT func_threads_per_block =
543  RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
544  if (func_threads_per_block <= func_max_threads_per_block)
545  {
546  return func_threads_per_block;
547  }
548  else
549  {
550  return IdxT(0);
551  }
552  }
553 
555  IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
556  {
557  IdxT func_blocks_per_device =
558  RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
559  return func_blocks_per_device;
560  }
561 
563  auto get_block_and_grid_size_to_fit_len() const
564  {
565  IdxT func_max_threads_per_block = this->get_max_block_size();
566  IdxT func_blocks_per_device =
567  RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
568  return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
569  }
570 
572  IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
573  {
574  IdxT func_max_threads_per_block = this->get_max_block_size();
575  IdxT func_threads_per_block =
576  RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
577  return std::min(func_threads_per_block, func_max_threads_per_block);
578  }
579 
581  IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const
582  {
583  auto data = cuda_occupancy_max_blocks<UniqueMarker>(
584  m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
585  IdxT func_max_blocks_per_device =
586  Concretizer::template get_max_grid_size<IdxT>(data);
587  IdxT func_blocks_per_device =
588  RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
589  return std::min(func_blocks_per_device, func_max_blocks_per_device);
590  }
591 
593  auto get_block_and_grid_size_to_fit_device() const
594  {
595  IdxT func_max_threads_per_block = this->get_max_block_size();
596  IdxT func_blocks_per_device =
597  this->get_grid_size_to_fit_device(func_max_threads_per_block);
598  return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
599  }
600 
601 private:
602  const void* m_func;
603  size_t m_func_dynamic_shmem_per_block;
604  IdxT m_len;
605 };
606 
607 } // namespace cuda
608 
609 } // namespace RAJA
610 
611 #endif // closing endif for RAJA_ENABLE_CUDA
612 
613 #endif // closing endif for header file include guard
RAJA header file containing an implementation of a memory pool.
Header file containing RAJA CUDA policy definitions.
Header file for common RAJA internal macro definitions.
RAJA_HOST_DEVICE void RAJA_ABORT_OR_THROW(const char *str)
Definition: macros.hpp:143
#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)
Definition: macros.hpp:122
Args args
Definition: WorkRunner.hpp:212
value_type::device_call &[i_loop] iter
Definition: WorkRunner.hpp:216
RAJA_INLINE void synchronize_impl(const omp_synchronize &)
Synchronize all OpenMP threads and tasks.
Definition: synchronize.hpp:36
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268
RAJA_INLINE void * align(size_t alignment, size_t size, void *&ptr, size_t &space)
Definition: align.hpp:33
void synchronize()
Synchronize all current RAJA executions for the specified policy.
Definition: synchronize.hpp:44
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
Header file containing utility methods used in CUDA operations.
Header file for RAJA resource definitions.
Assign a new value to an object and restore the object's previous value at the end of the current sco...
Definition: types.hpp:1028
Header file for RAJA type definitions.