20 #ifndef RAJA_sort_cuda_HPP
21 #define RAJA_sort_cuda_HPP
23 #include "RAJA/config.hpp"
25 #if defined(RAJA_ENABLE_CUDA)
29 #include <type_traits>
31 #include "cub/device/device_radix_sort.cuh"
49 template<
typename IterationMapping,
50 typename IterationGetter,
56 concepts::enable_if_t<
57 resources::EventProxy<resources::Cuda>,
58 concepts::negate<concepts::all_of<
59 type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
60 std::is_pointer<Iter>,
62 camp::is_same<Compare,
63 operators::less<RAJA::detail::IterVal<Iter>>>,
64 camp::is_same<Compare,
65 operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
66 stable(resources::Cuda cuda_res,
67 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
76 static_assert(std::is_pointer<Iter>::value,
77 "stable_sort<cuda_exec> is only implemented for pointers");
80 type_traits::is_arithmetic<iterval>::value,
81 "stable_sort<cuda_exec> is only implemented for arithmetic types");
82 static_assert(concepts::any_of<
83 camp::is_same<Compare, operators::less<iterval>>,
84 camp::is_same<Compare, operators::greater<iterval>>>::value,
85 "stable_sort<cuda_exec> is only implemented for "
86 "RAJA::operators::less or RAJA::operators::greater");
88 return resources::EventProxy<resources::Cuda>(cuda_res);
94 template<
typename IterationMapping,
95 typename IterationGetter,
100 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
101 type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
102 std::is_pointer<Iter>>
103 stable(resources::Cuda cuda_res,
104 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
113 cudaStream_t stream = cuda_res.get_stream();
117 int len = std::distance(begin, end);
119 int end_bit =
sizeof(R) * CHAR_BIT;
122 R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
126 cub::DoubleBuffer<R> d_keys(begin, d_out);
129 void* d_temp_storage =
nullptr;
130 size_t temp_storage_bytes = 0;
131 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeys,
132 d_temp_storage, temp_storage_bytes, d_keys,
133 len, begin_bit, end_bit, stream);
136 cuda::device_mempool_type::getInstance().malloc<
unsigned char>(
140 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeys,
141 d_temp_storage, temp_storage_bytes, d_keys,
142 len, begin_bit, end_bit, stream);
144 cuda::device_mempool_type::getInstance().free(d_temp_storage);
146 if (d_keys.Current() == d_out)
150 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, begin, d_out,
151 len *
sizeof(R), cudaMemcpyDefault, stream);
154 cuda::device_mempool_type::getInstance().free(d_out);
158 return resources::EventProxy<resources::Cuda>(cuda_res);
164 template<
typename IterationMapping,
165 typename IterationGetter,
166 typename Concretizer,
167 size_t BLOCKS_PER_SM,
170 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
171 type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
172 std::is_pointer<Iter>>
173 stable(resources::Cuda cuda_res,
174 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
183 cudaStream_t stream = cuda_res.get_stream();
187 int len = std::distance(begin, end);
189 int end_bit =
sizeof(R) * CHAR_BIT;
192 R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
196 cub::DoubleBuffer<R> d_keys(begin, d_out);
199 void* d_temp_storage =
nullptr;
200 size_t temp_storage_bytes = 0;
201 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeysDescending,
202 d_temp_storage, temp_storage_bytes, d_keys,
203 len, begin_bit, end_bit, stream);
206 cuda::device_mempool_type::getInstance().malloc<
unsigned char>(
210 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeysDescending,
211 d_temp_storage, temp_storage_bytes, d_keys,
212 len, begin_bit, end_bit, stream);
214 cuda::device_mempool_type::getInstance().free(d_temp_storage);
216 if (d_keys.Current() == d_out)
220 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, begin, d_out,
221 len *
sizeof(R), cudaMemcpyDefault, stream);
224 cuda::device_mempool_type::getInstance().free(d_out);
228 return resources::EventProxy<resources::Cuda>(cuda_res);
234 template<
typename IterationMapping,
235 typename IterationGetter,
236 typename Concretizer,
237 size_t BLOCKS_PER_SM,
241 concepts::enable_if_t<
242 resources::EventProxy<resources::Cuda>,
243 concepts::negate<concepts::all_of<
244 type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
245 std::is_pointer<Iter>,
247 camp::is_same<Compare,
248 operators::less<RAJA::detail::IterVal<Iter>>>,
249 camp::is_same<Compare,
250 operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
252 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
261 static_assert(std::is_pointer<Iter>::value,
262 "sort<cuda_exec> is only implemented for pointers");
264 static_assert(type_traits::is_arithmetic<iterval>::value,
265 "sort<cuda_exec> is only implemented for arithmetic types");
266 static_assert(concepts::any_of<
267 camp::is_same<Compare, operators::less<iterval>>,
268 camp::is_same<Compare, operators::greater<iterval>>>::value,
269 "sort<cuda_exec> is only implemented for RAJA::operators::less "
270 "or RAJA::operators::greater");
272 return resources::EventProxy<resources::Cuda>(cuda_res);
278 template<
typename IterationMapping,
279 typename IterationGetter,
280 typename Concretizer,
281 size_t BLOCKS_PER_SM,
284 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
285 type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
286 std::is_pointer<Iter>>
288 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
297 return stable(cuda_res, p, begin, end, comp);
303 template<
typename IterationMapping,
304 typename IterationGetter,
305 typename Concretizer,
306 size_t BLOCKS_PER_SM,
309 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
310 type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
311 std::is_pointer<Iter>>
313 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
322 return stable(cuda_res, p, begin, end, comp);
328 template<
typename IterationMapping,
329 typename IterationGetter,
330 typename Concretizer,
331 size_t BLOCKS_PER_SM,
336 concepts::enable_if_t<
337 resources::EventProxy<resources::Cuda>,
338 concepts::negate<concepts::all_of<
339 type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
340 std::is_pointer<KeyIter>,
341 std::is_pointer<ValIter>,
343 camp::is_same<Compare,
344 operators::less<RAJA::detail::IterVal<KeyIter>>>,
347 operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
349 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
360 std::is_pointer<KeyIter>::value,
361 "stable_sort_pairs<cuda_exec> is only implemented for pointers");
363 std::is_pointer<ValIter>::value,
364 "stable_sort_pairs<cuda_exec> is only implemented for pointers");
367 type_traits::is_arithmetic<K>::value,
368 "stable_sort_pairs<cuda_exec> is only implemented for arithmetic types");
370 concepts::any_of<camp::is_same<Compare, operators::less<K>>,
371 camp::is_same<Compare, operators::greater<K>>>::value,
372 "stable_sort_pairs<cuda_exec> is only implemented for "
373 "RAJA::operators::less or RAJA::operators::greater");
375 return resources::EventProxy<resources::Cuda>(cuda_res);
381 template<
typename IterationMapping,
382 typename IterationGetter,
383 typename Concretizer,
384 size_t BLOCKS_PER_SM,
388 concepts::enable_if_t<
389 resources::EventProxy<resources::Cuda>,
390 type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
391 std::is_pointer<KeyIter>,
392 std::is_pointer<ValIter>>
394 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
404 cudaStream_t stream = cuda_res.get_stream();
409 int len = std::distance(keys_begin, keys_end);
411 int end_bit =
sizeof(K) * CHAR_BIT;
414 K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
415 V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
419 cub::DoubleBuffer<K> d_keys(keys_begin, d_keys_out);
420 cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
423 void* d_temp_storage =
nullptr;
424 size_t temp_storage_bytes = 0;
425 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairs,
426 d_temp_storage, temp_storage_bytes, d_keys,
427 d_vals, len, begin_bit, end_bit, stream);
430 cuda::device_mempool_type::getInstance().malloc<
unsigned char>(
434 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairs,
435 d_temp_storage, temp_storage_bytes, d_keys,
436 d_vals, len, begin_bit, end_bit, stream);
438 cuda::device_mempool_type::getInstance().free(d_temp_storage);
440 if (d_keys.Current() == d_keys_out)
444 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, keys_begin, d_keys_out,
445 len *
sizeof(K), cudaMemcpyDefault, stream);
447 if (d_vals.Current() == d_vals_out)
451 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, vals_begin, d_vals_out,
452 len *
sizeof(V), cudaMemcpyDefault, stream);
455 cuda::device_mempool_type::getInstance().free(d_keys_out);
456 cuda::device_mempool_type::getInstance().free(d_vals_out);
460 return resources::EventProxy<resources::Cuda>(cuda_res);
466 template<
typename IterationMapping,
467 typename IterationGetter,
468 typename Concretizer,
469 size_t BLOCKS_PER_SM,
473 concepts::enable_if_t<
474 resources::EventProxy<resources::Cuda>,
475 type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
476 std::is_pointer<KeyIter>,
477 std::is_pointer<ValIter>>
479 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
489 cudaStream_t stream = cuda_res.get_stream();
494 int len = std::distance(keys_begin, keys_end);
496 int end_bit =
sizeof(K) * CHAR_BIT;
499 K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
500 V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
504 cub::DoubleBuffer<K> d_keys(keys_begin, d_keys_out);
505 cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
508 void* d_temp_storage =
nullptr;
509 size_t temp_storage_bytes = 0;
510 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairsDescending,
511 d_temp_storage, temp_storage_bytes, d_keys,
512 d_vals, len, begin_bit, end_bit, stream);
515 cuda::device_mempool_type::getInstance().malloc<
unsigned char>(
519 CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairsDescending,
520 d_temp_storage, temp_storage_bytes, d_keys,
521 d_vals, len, begin_bit, end_bit, stream);
523 cuda::device_mempool_type::getInstance().free(d_temp_storage);
525 if (d_keys.Current() == d_keys_out)
529 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, keys_begin, d_keys_out,
530 len *
sizeof(K), cudaMemcpyDefault, stream);
532 if (d_vals.Current() == d_vals_out)
536 CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, vals_begin, d_vals_out,
537 len *
sizeof(V), cudaMemcpyDefault, stream);
540 cuda::device_mempool_type::getInstance().free(d_keys_out);
541 cuda::device_mempool_type::getInstance().free(d_vals_out);
545 return resources::EventProxy<resources::Cuda>(cuda_res);
551 template<
typename IterationMapping,
552 typename IterationGetter,
553 typename Concretizer,
554 size_t BLOCKS_PER_SM,
559 concepts::enable_if_t<
560 resources::EventProxy<resources::Cuda>,
561 concepts::negate<concepts::all_of<
562 type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
563 std::is_pointer<KeyIter>,
564 std::is_pointer<ValIter>,
566 camp::is_same<Compare,
567 operators::less<RAJA::detail::IterVal<KeyIter>>>,
570 operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
572 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
582 static_assert(std::is_pointer<KeyIter>::value,
583 "sort_pairs<cuda_exec> is only implemented for pointers");
584 static_assert(std::is_pointer<ValIter>::value,
585 "sort_pairs<cuda_exec> is only implemented for pointers");
588 type_traits::is_arithmetic<K>::value,
589 "sort_pairs<cuda_exec> is only implemented for arithmetic types");
591 concepts::any_of<camp::is_same<Compare, operators::less<K>>,
592 camp::is_same<Compare, operators::greater<K>>>::value,
593 "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or "
594 "RAJA::operators::greater");
596 return resources::EventProxy<resources::Cuda>(cuda_res);
602 template<
typename IterationMapping,
603 typename IterationGetter,
604 typename Concretizer,
605 size_t BLOCKS_PER_SM,
609 concepts::enable_if_t<
610 resources::EventProxy<resources::Cuda>,
611 type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
612 std::is_pointer<KeyIter>,
613 std::is_pointer<ValIter>>
615 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
625 return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
631 template<
typename IterationMapping,
632 typename IterationGetter,
633 typename Concretizer,
634 size_t BLOCKS_PER_SM,
638 concepts::enable_if_t<
639 resources::EventProxy<resources::Cuda>,
640 type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
641 std::is_pointer<KeyIter>,
642 std::is_pointer<ValIter>>
644 ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
654 return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file for RAJA operator definitions.
Header file for RAJA algorithm definitions.
Header file for RAJA concept definitions.
Header file containing RAJA CUDA policy definitions.
typename ::std::iterator_traits< Iter >::value_type IterVal
Definition: algorithm.hpp:38
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > stable_pairs(resources::Host host_res, const ExecPolicy &, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, Compare comp)
Definition: sort.hpp:276
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > stable(resources::Host host_res, const ExecPolicy &, Iter begin, Iter end, Compare comp)
stable sort given range using comparison function
Definition: sort.hpp:230
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > unstable(resources::Host host_res, const ExecPolicy &, Iter begin, Iter end, Compare comp)
sort given range using comparison function
Definition: sort.hpp:213
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > unstable_pairs(resources::Host host_res, const ExecPolicy &, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, Compare comp)
sort given range of pairs using comparison function on keys
Definition: sort.hpp:250
concepts::enable_if_t< resources::EventProxy< Res >, type_traits::is_execution_policy< ExecPolicy >, type_traits::is_resource< Res >, std::is_constructible< camp::resources::Resource, Res >, type_traits::is_range< Container > > sort(ExecPolicy &&p, Res r, Container &&c, Compare comp=Compare {})
sort execution pattern
Definition: sort.hpp:61
Definition: AlignedRangeIndexSetBuilders.cpp:35
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268