RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
sort.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_sort_cuda_HPP
21 #define RAJA_sort_cuda_HPP
22 
23 #include "RAJA/config.hpp"
24 
25 #if defined(RAJA_ENABLE_CUDA)
26 
27 #include <climits>
28 #include <iterator>
29 #include <type_traits>
30 
31 #include "cub/device/device_radix_sort.cuh"
32 
33 #include "RAJA/util/concepts.hpp"
34 #include "RAJA/util/Operators.hpp"
38 
39 namespace RAJA
40 {
41 namespace impl
42 {
43 namespace sort
44 {
45 
49 template<typename IterationMapping,
50  typename IterationGetter,
51  typename Concretizer,
52  size_t BLOCKS_PER_SM,
53  bool Async,
54  typename Iter,
55  typename Compare>
56 concepts::enable_if_t<
57  resources::EventProxy<resources::Cuda>,
58  concepts::negate<concepts::all_of<
59  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
60  std::is_pointer<Iter>,
61  concepts::any_of<
62  camp::is_same<Compare,
63  operators::less<RAJA::detail::IterVal<Iter>>>,
64  camp::is_same<Compare,
65  operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
66 stable(resources::Cuda cuda_res,
67  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
68  IterationGetter,
69  Concretizer,
70  BLOCKS_PER_SM,
71  Async>,
72  Iter,
73  Iter,
74  Compare)
75 {
76  static_assert(std::is_pointer<Iter>::value,
77  "stable_sort<cuda_exec> is only implemented for pointers");
78  using iterval = RAJA::detail::IterVal<Iter>;
79  static_assert(
80  type_traits::is_arithmetic<iterval>::value,
81  "stable_sort<cuda_exec> is only implemented for arithmetic types");
82  static_assert(concepts::any_of<
83  camp::is_same<Compare, operators::less<iterval>>,
84  camp::is_same<Compare, operators::greater<iterval>>>::value,
85  "stable_sort<cuda_exec> is only implemented for "
86  "RAJA::operators::less or RAJA::operators::greater");
87 
88  return resources::EventProxy<resources::Cuda>(cuda_res);
89 }
90 
94 template<typename IterationMapping,
95  typename IterationGetter,
96  typename Concretizer,
97  size_t BLOCKS_PER_SM,
98  bool Async,
99  typename Iter>
100 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
101  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
102  std::is_pointer<Iter>>
103 stable(resources::Cuda cuda_res,
104  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
105  IterationGetter,
106  Concretizer,
107  BLOCKS_PER_SM,
108  Async>,
109  Iter begin,
110  Iter end,
111  operators::less<RAJA::detail::IterVal<Iter>>)
112 {
113  cudaStream_t stream = cuda_res.get_stream();
114 
115  using R = RAJA::detail::IterVal<Iter>;
116 
117  int len = std::distance(begin, end);
118  int begin_bit = 0;
119  int end_bit = sizeof(R) * CHAR_BIT;
120 
121  // Allocate temporary storage for the output array
122  R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
123 
124  // use cub double buffer to reduce temporary memory requirements
125  // by allowing cub to write to the begin buffer
126  cub::DoubleBuffer<R> d_keys(begin, d_out);
127 
128  // Determine temporary device storage requirements
129  void* d_temp_storage = nullptr;
130  size_t temp_storage_bytes = 0;
131  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeys,
132  d_temp_storage, temp_storage_bytes, d_keys,
133  len, begin_bit, end_bit, stream);
134  // Allocate temporary storage
135  d_temp_storage =
136  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
137  temp_storage_bytes);
138 
139  // Run
140  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeys,
141  d_temp_storage, temp_storage_bytes, d_keys,
142  len, begin_bit, end_bit, stream);
143  // Free temporary storage
144  cuda::device_mempool_type::getInstance().free(d_temp_storage);
145 
146  if (d_keys.Current() == d_out)
147  {
148 
149  // copy
150  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, begin, d_out,
151  len * sizeof(R), cudaMemcpyDefault, stream);
152  }
153 
154  cuda::device_mempool_type::getInstance().free(d_out);
155 
156  cuda::launch(cuda_res, Async);
157 
158  return resources::EventProxy<resources::Cuda>(cuda_res);
159 }
160 
164 template<typename IterationMapping,
165  typename IterationGetter,
166  typename Concretizer,
167  size_t BLOCKS_PER_SM,
168  bool Async,
169  typename Iter>
170 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
171  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
172  std::is_pointer<Iter>>
173 stable(resources::Cuda cuda_res,
174  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
175  IterationGetter,
176  Concretizer,
177  BLOCKS_PER_SM,
178  Async>,
179  Iter begin,
180  Iter end,
181  operators::greater<RAJA::detail::IterVal<Iter>>)
182 {
183  cudaStream_t stream = cuda_res.get_stream();
184 
185  using R = RAJA::detail::IterVal<Iter>;
186 
187  int len = std::distance(begin, end);
188  int begin_bit = 0;
189  int end_bit = sizeof(R) * CHAR_BIT;
190 
191  // Allocate temporary storage for the output array
192  R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
193 
194  // use cub double buffer to reduce temporary memory requirements
195  // by allowing cub to write to the begin buffer
196  cub::DoubleBuffer<R> d_keys(begin, d_out);
197 
198  // Determine temporary device storage requirements
199  void* d_temp_storage = nullptr;
200  size_t temp_storage_bytes = 0;
201  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeysDescending,
202  d_temp_storage, temp_storage_bytes, d_keys,
203  len, begin_bit, end_bit, stream);
204  // Allocate temporary storage
205  d_temp_storage =
206  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
207  temp_storage_bytes);
208 
209  // Run
210  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortKeysDescending,
211  d_temp_storage, temp_storage_bytes, d_keys,
212  len, begin_bit, end_bit, stream);
213  // Free temporary storage
214  cuda::device_mempool_type::getInstance().free(d_temp_storage);
215 
216  if (d_keys.Current() == d_out)
217  {
218 
219  // copy
220  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, begin, d_out,
221  len * sizeof(R), cudaMemcpyDefault, stream);
222  }
223 
224  cuda::device_mempool_type::getInstance().free(d_out);
225 
226  cuda::launch(cuda_res, Async);
227 
228  return resources::EventProxy<resources::Cuda>(cuda_res);
229 }
230 
234 template<typename IterationMapping,
235  typename IterationGetter,
236  typename Concretizer,
237  size_t BLOCKS_PER_SM,
238  bool Async,
239  typename Iter,
240  typename Compare>
241 concepts::enable_if_t<
242  resources::EventProxy<resources::Cuda>,
243  concepts::negate<concepts::all_of<
244  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
245  std::is_pointer<Iter>,
246  concepts::any_of<
247  camp::is_same<Compare,
248  operators::less<RAJA::detail::IterVal<Iter>>>,
249  camp::is_same<Compare,
250  operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
251 unstable(resources::Cuda cuda_res,
252  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
253  IterationGetter,
254  Concretizer,
255  BLOCKS_PER_SM,
256  Async>,
257  Iter,
258  Iter,
259  Compare)
260 {
261  static_assert(std::is_pointer<Iter>::value,
262  "sort<cuda_exec> is only implemented for pointers");
263  using iterval = RAJA::detail::IterVal<Iter>;
264  static_assert(type_traits::is_arithmetic<iterval>::value,
265  "sort<cuda_exec> is only implemented for arithmetic types");
266  static_assert(concepts::any_of<
267  camp::is_same<Compare, operators::less<iterval>>,
268  camp::is_same<Compare, operators::greater<iterval>>>::value,
269  "sort<cuda_exec> is only implemented for RAJA::operators::less "
270  "or RAJA::operators::greater");
271 
272  return resources::EventProxy<resources::Cuda>(cuda_res);
273 }
274 
278 template<typename IterationMapping,
279  typename IterationGetter,
280  typename Concretizer,
281  size_t BLOCKS_PER_SM,
282  bool Async,
283  typename Iter>
284 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
285  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
286  std::is_pointer<Iter>>
287 unstable(resources::Cuda cuda_res,
288  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
289  IterationGetter,
290  Concretizer,
291  BLOCKS_PER_SM,
292  Async> p,
293  Iter begin,
294  Iter end,
295  operators::less<RAJA::detail::IterVal<Iter>> comp)
296 {
297  return stable(cuda_res, p, begin, end, comp);
298 }
299 
303 template<typename IterationMapping,
304  typename IterationGetter,
305  typename Concretizer,
306  size_t BLOCKS_PER_SM,
307  bool Async,
308  typename Iter>
309 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
310  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
311  std::is_pointer<Iter>>
312 unstable(resources::Cuda cuda_res,
313  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
314  IterationGetter,
315  Concretizer,
316  BLOCKS_PER_SM,
317  Async> p,
318  Iter begin,
319  Iter end,
320  operators::greater<RAJA::detail::IterVal<Iter>> comp)
321 {
322  return stable(cuda_res, p, begin, end, comp);
323 }
324 
328 template<typename IterationMapping,
329  typename IterationGetter,
330  typename Concretizer,
331  size_t BLOCKS_PER_SM,
332  bool Async,
333  typename KeyIter,
334  typename ValIter,
335  typename Compare>
336 concepts::enable_if_t<
337  resources::EventProxy<resources::Cuda>,
338  concepts::negate<concepts::all_of<
339  type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
340  std::is_pointer<KeyIter>,
341  std::is_pointer<ValIter>,
342  concepts::any_of<
343  camp::is_same<Compare,
344  operators::less<RAJA::detail::IterVal<KeyIter>>>,
345  camp::is_same<
346  Compare,
347  operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
348 stable_pairs(resources::Cuda cuda_res,
349  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
350  IterationGetter,
351  Concretizer,
352  BLOCKS_PER_SM,
353  Async>,
354  KeyIter,
355  KeyIter,
356  ValIter,
357  Compare)
358 {
359  static_assert(
360  std::is_pointer<KeyIter>::value,
361  "stable_sort_pairs<cuda_exec> is only implemented for pointers");
362  static_assert(
363  std::is_pointer<ValIter>::value,
364  "stable_sort_pairs<cuda_exec> is only implemented for pointers");
366  static_assert(
367  type_traits::is_arithmetic<K>::value,
368  "stable_sort_pairs<cuda_exec> is only implemented for arithmetic types");
369  static_assert(
370  concepts::any_of<camp::is_same<Compare, operators::less<K>>,
371  camp::is_same<Compare, operators::greater<K>>>::value,
372  "stable_sort_pairs<cuda_exec> is only implemented for "
373  "RAJA::operators::less or RAJA::operators::greater");
374 
375  return resources::EventProxy<resources::Cuda>(cuda_res);
376 }
377 
381 template<typename IterationMapping,
382  typename IterationGetter,
383  typename Concretizer,
384  size_t BLOCKS_PER_SM,
385  bool Async,
386  typename KeyIter,
387  typename ValIter>
388 concepts::enable_if_t<
389  resources::EventProxy<resources::Cuda>,
390  type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
391  std::is_pointer<KeyIter>,
392  std::is_pointer<ValIter>>
393 stable_pairs(resources::Cuda cuda_res,
394  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
395  IterationGetter,
396  Concretizer,
397  BLOCKS_PER_SM,
398  Async>,
399  KeyIter keys_begin,
400  KeyIter keys_end,
401  ValIter vals_begin,
402  operators::less<RAJA::detail::IterVal<KeyIter>>)
403 {
404  cudaStream_t stream = cuda_res.get_stream();
405 
408 
409  int len = std::distance(keys_begin, keys_end);
410  int begin_bit = 0;
411  int end_bit = sizeof(K) * CHAR_BIT;
412 
413  // Allocate temporary storage for the output arrays
414  K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
415  V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
416 
417  // use cub double buffer to reduce temporary memory requirements
418  // by allowing cub to write to the keys_begin and vals_begin buffers
419  cub::DoubleBuffer<K> d_keys(keys_begin, d_keys_out);
420  cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
421 
422  // Determine temporary device storage requirements
423  void* d_temp_storage = nullptr;
424  size_t temp_storage_bytes = 0;
425  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairs,
426  d_temp_storage, temp_storage_bytes, d_keys,
427  d_vals, len, begin_bit, end_bit, stream);
428  // Allocate temporary storage
429  d_temp_storage =
430  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
431  temp_storage_bytes);
432 
433  // Run
434  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairs,
435  d_temp_storage, temp_storage_bytes, d_keys,
436  d_vals, len, begin_bit, end_bit, stream);
437  // Free temporary storage
438  cuda::device_mempool_type::getInstance().free(d_temp_storage);
439 
440  if (d_keys.Current() == d_keys_out)
441  {
442 
443  // copy keys
444  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, keys_begin, d_keys_out,
445  len * sizeof(K), cudaMemcpyDefault, stream);
446  }
447  if (d_vals.Current() == d_vals_out)
448  {
449 
450  // copy vals
451  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, vals_begin, d_vals_out,
452  len * sizeof(V), cudaMemcpyDefault, stream);
453  }
454 
455  cuda::device_mempool_type::getInstance().free(d_keys_out);
456  cuda::device_mempool_type::getInstance().free(d_vals_out);
457 
458  cuda::launch(cuda_res, Async);
459 
460  return resources::EventProxy<resources::Cuda>(cuda_res);
461 }
462 
466 template<typename IterationMapping,
467  typename IterationGetter,
468  typename Concretizer,
469  size_t BLOCKS_PER_SM,
470  bool Async,
471  typename KeyIter,
472  typename ValIter>
473 concepts::enable_if_t<
474  resources::EventProxy<resources::Cuda>,
475  type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
476  std::is_pointer<KeyIter>,
477  std::is_pointer<ValIter>>
478 stable_pairs(resources::Cuda cuda_res,
479  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
480  IterationGetter,
481  Concretizer,
482  BLOCKS_PER_SM,
483  Async>,
484  KeyIter keys_begin,
485  KeyIter keys_end,
486  ValIter vals_begin,
487  operators::greater<RAJA::detail::IterVal<KeyIter>>)
488 {
489  cudaStream_t stream = cuda_res.get_stream();
490 
493 
494  int len = std::distance(keys_begin, keys_end);
495  int begin_bit = 0;
496  int end_bit = sizeof(K) * CHAR_BIT;
497 
498  // Allocate temporary storage for the output arrays
499  K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
500  V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
501 
502  // use cub double buffer to reduce temporary memory requirements
503  // by allowing cub to write to the keys_begin and vals_begin buffers
504  cub::DoubleBuffer<K> d_keys(keys_begin, d_keys_out);
505  cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
506 
507  // Determine temporary device storage requirements
508  void* d_temp_storage = nullptr;
509  size_t temp_storage_bytes = 0;
510  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairsDescending,
511  d_temp_storage, temp_storage_bytes, d_keys,
512  d_vals, len, begin_bit, end_bit, stream);
513  // Allocate temporary storage
514  d_temp_storage =
515  cuda::device_mempool_type::getInstance().malloc<unsigned char>(
516  temp_storage_bytes);
517 
518  // Run
519  CAMP_CUDA_API_INVOKE_AND_CHECK(::cub::DeviceRadixSort::SortPairsDescending,
520  d_temp_storage, temp_storage_bytes, d_keys,
521  d_vals, len, begin_bit, end_bit, stream);
522  // Free temporary storage
523  cuda::device_mempool_type::getInstance().free(d_temp_storage);
524 
525  if (d_keys.Current() == d_keys_out)
526  {
527 
528  // copy keys
529  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, keys_begin, d_keys_out,
530  len * sizeof(K), cudaMemcpyDefault, stream);
531  }
532  if (d_vals.Current() == d_vals_out)
533  {
534 
535  // copy vals
536  CAMP_CUDA_API_INVOKE_AND_CHECK(cudaMemcpyAsync, vals_begin, d_vals_out,
537  len * sizeof(V), cudaMemcpyDefault, stream);
538  }
539 
540  cuda::device_mempool_type::getInstance().free(d_keys_out);
541  cuda::device_mempool_type::getInstance().free(d_vals_out);
542 
543  cuda::launch(cuda_res, Async);
544 
545  return resources::EventProxy<resources::Cuda>(cuda_res);
546 }
547 
551 template<typename IterationMapping,
552  typename IterationGetter,
553  typename Concretizer,
554  size_t BLOCKS_PER_SM,
555  bool Async,
556  typename KeyIter,
557  typename ValIter,
558  typename Compare>
559 concepts::enable_if_t<
560  resources::EventProxy<resources::Cuda>,
561  concepts::negate<concepts::all_of<
562  type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
563  std::is_pointer<KeyIter>,
564  std::is_pointer<ValIter>,
565  concepts::any_of<
566  camp::is_same<Compare,
567  operators::less<RAJA::detail::IterVal<KeyIter>>>,
568  camp::is_same<
569  Compare,
570  operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
571 unstable_pairs(resources::Cuda cuda_res,
572  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
573  IterationGetter,
574  Concretizer,
575  BLOCKS_PER_SM,
576  Async>,
577  KeyIter,
578  KeyIter,
579  ValIter,
580  Compare)
581 {
582  static_assert(std::is_pointer<KeyIter>::value,
583  "sort_pairs<cuda_exec> is only implemented for pointers");
584  static_assert(std::is_pointer<ValIter>::value,
585  "sort_pairs<cuda_exec> is only implemented for pointers");
587  static_assert(
588  type_traits::is_arithmetic<K>::value,
589  "sort_pairs<cuda_exec> is only implemented for arithmetic types");
590  static_assert(
591  concepts::any_of<camp::is_same<Compare, operators::less<K>>,
592  camp::is_same<Compare, operators::greater<K>>>::value,
593  "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or "
594  "RAJA::operators::greater");
595 
596  return resources::EventProxy<resources::Cuda>(cuda_res);
597 }
598 
602 template<typename IterationMapping,
603  typename IterationGetter,
604  typename Concretizer,
605  size_t BLOCKS_PER_SM,
606  bool Async,
607  typename KeyIter,
608  typename ValIter>
609 concepts::enable_if_t<
610  resources::EventProxy<resources::Cuda>,
611  type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
612  std::is_pointer<KeyIter>,
613  std::is_pointer<ValIter>>
614 unstable_pairs(resources::Cuda cuda_res,
615  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
616  IterationGetter,
617  Concretizer,
618  BLOCKS_PER_SM,
619  Async> p,
620  KeyIter keys_begin,
621  KeyIter keys_end,
622  ValIter vals_begin,
623  operators::less<RAJA::detail::IterVal<KeyIter>> comp)
624 {
625  return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
626 }
627 
631 template<typename IterationMapping,
632  typename IterationGetter,
633  typename Concretizer,
634  size_t BLOCKS_PER_SM,
635  bool Async,
636  typename KeyIter,
637  typename ValIter>
638 concepts::enable_if_t<
639  resources::EventProxy<resources::Cuda>,
640  type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
641  std::is_pointer<KeyIter>,
642  std::is_pointer<ValIter>>
643 unstable_pairs(resources::Cuda cuda_res,
644  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
645  IterationGetter,
646  Concretizer,
647  BLOCKS_PER_SM,
648  Async> p,
649  KeyIter keys_begin,
650  KeyIter keys_end,
651  ValIter vals_begin,
652  operators::greater<RAJA::detail::IterVal<KeyIter>> comp)
653 {
654  return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
655 }
656 
657 } // namespace sort
658 
659 } // namespace impl
660 
661 } // namespace RAJA
662 
663 #endif // closing endif for RAJA_ENABLE_CUDA guard
664 
665 #endif // closing endif for header file include guard
Header file defining prototypes for routines used to manage memory for CUDA reductions and other oper...
Header file for RAJA operator definitions.
Header file for RAJA algorithm definitions.
Header file for RAJA concept definitions.
Header file containing RAJA CUDA policy definitions.
typename ::std::iterator_traits< Iter >::value_type IterVal
Definition: algorithm.hpp:38
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > stable_pairs(resources::Host host_res, const ExecPolicy &, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, Compare comp)
Definition: sort.hpp:276
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > stable(resources::Host host_res, const ExecPolicy &, Iter begin, Iter end, Compare comp)
stable sort given range using comparison function
Definition: sort.hpp:230
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > unstable(resources::Host host_res, const ExecPolicy &, Iter begin, Iter end, Compare comp)
sort given range using comparison function
Definition: sort.hpp:213
concepts::enable_if_t< resources::EventProxy< resources::Host >, type_traits::is_openmp_policy< ExecPolicy > > unstable_pairs(resources::Host host_res, const ExecPolicy &, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, Compare comp)
sort given range of pairs using comparison function on keys
Definition: sort.hpp:250
concepts::enable_if_t< resources::EventProxy< Res >, type_traits::is_execution_policy< ExecPolicy >, type_traits::is_resource< Res >, std::is_constructible< camp::resources::Resource, Res >, type_traits::is_range< Container > > sort(ExecPolicy &&p, Res r, Container &&c, Compare comp=Compare {})
sort execution pattern
Definition: sort.hpp:61
Definition: AlignedRangeIndexSetBuilders.cpp:35
void launch(LaunchParams const &launch_params, ReduceParams &&... rest_of_launch_args)
Definition: launch_core.hpp:268