doxygen/html/cuda__warp_8hpp_source.html

 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

 // Copyright (c) Lawrence Livermore National Security, LLC and other

 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT

 // files for dates and other details. No copyright assignment is required

 // to contribute to RAJA.

 //

 // SPDX-License-Identifier: (BSD-3-Clause)

 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//


 #ifndef RAJA_policy_tensor_arch_cuda_cuda_warp_register_HPP

 #define RAJA_policy_tensor_arch_cuda_cuda_warp_register_HPP


 #include "RAJA/config.hpp"


 #if defined(RAJA_CUDA_ACTIVE)


 #include "RAJA/util/macros.hpp"

 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"

 #include "RAJA/util/macros.hpp"

 #include "RAJA/util/Operators.hpp"


 #include "RAJA/policy/cuda/intrinsics.hpp"


 namespace RAJA

 {

 namespace expt

 {


 template<typename ELEMENT_TYPE>

 class Register<ELEMENT_TYPE, cuda_warp_register>

     : public internal::expt::RegisterBase<

           Register<ELEMENT_TYPE, cuda_warp_register>>

 {

 public:

   using base_type =

       internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>;


   using register_policy = cuda_warp_register;

   using self_type       = Register<ELEMENT_TYPE, cuda_warp_register>;

   using element_type    = ELEMENT_TYPE;

   using register_type   = ELEMENT_TYPE;


   using int_vector_type = Register<int64_t, cuda_warp_register>;


 private:

   element_type m_value;


 public:

   static constexpr int s_num_elem = RAJA_CUDA_WARPSIZE;


   RAJA_INLINE


   RAJA_DEVICE

   constexpr Register() : base_type(), m_value(0) {}


   RAJA_INLINE


   RAJA_DEVICE

   constexpr Register(element_type c) : base_type(), m_value(c) {}


   RAJA_INLINE


   RAJA_DEVICE

   constexpr Register(self_type const& c) : base_type(), m_value(c.m_value) {}


   RAJA_INLINE


   RAJA_DEVICE

   self_type& operator=(self_type const& c)

   {

     m_value = c.m_value;

     return *this;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type& operator=(element_type c)

   {

     m_value = c;

     return *this;

   }


   RAJA_INLINE


   RAJA_DEVICE

   constexpr static int get_lane() { return threadIdx.x; }


   RAJA_DEVICE


   RAJA_INLINE

   constexpr element_type const& get_raw_value() const { return m_value; }


   RAJA_DEVICE


   RAJA_INLINE

   element_type& get_raw_value() { return m_value; }


   RAJA_DEVICE


   RAJA_INLINE

   static constexpr bool is_root() { return get_lane() == 0; }


   RAJA_INLINE


   RAJA_DEVICE

   self_type& load_packed(element_type const* ptr)

   {


     auto lane = get_lane();


     m_value = ptr[lane];


     return *this;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type& load_packed_n(element_type const* ptr, int N)

   {

     auto lane = get_lane();

     if (lane < N)

     {

       m_value = ptr[lane];

     }

     else

     {

       m_value = element_type(0);

     }

     return *this;

   }


   // Strided load: the lane reported by get_lane() reads
   // ptr[stride * lane] into the stored value.
   //
   //   ptr    - base address of the data
   //   stride - distance, in elements, between the elements read by
   //            consecutive lanes
   //
   // Returns *this.
   RAJA_INLINE
   RAJA_DEVICE
   self_type& load_strided(element_type const* ptr, int stride)
   {
     // Widen to 64 bits before multiplying: stride and the lane index are
     // both int, so their product could otherwise overflow for large strides.
     auto const offset = static_cast<int64_t>(stride) * get_lane();

     m_value = ptr[offset];

     return *this;
   }


   // Partial strided load: lanes with get_lane() < N read
   // ptr[stride * lane]; all other lanes store element_type(0) without
   // touching memory.
   //
   //   ptr    - base address of the data
   //   stride - distance, in elements, between the elements read by
   //            consecutive lanes
   //   N      - number of leading lanes that perform a load
   //
   // Returns *this.
   RAJA_INLINE
   RAJA_DEVICE
   self_type& load_strided_n(element_type const* ptr, int stride, int N)
   {
     auto lane = get_lane();

     if (lane < N)
     {
       // Widen to 64 bits before multiplying: stride and lane are both int,
       // so their product could otherwise overflow for large strides.
       m_value = ptr[static_cast<int64_t>(stride) * lane];
     }
     else
     {
       m_value = element_type(0);
     }

     return *this;
   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type& gather(element_type const* ptr, int_vector_type offsets)

   {


     m_value = ptr[offsets.get_raw_value()];


     return *this;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type& gather_n(element_type const* ptr,

                       int_vector_type offsets,

                       camp::idx_t N)

   {

     if (get_lane() < N)

     {

       m_value = ptr[offsets.get_raw_value()];

     }

     else

     {

       m_value = element_type(0);

     }


     return *this;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type& segmented_load(element_type const* ptr,

                             camp::idx_t segbits,

                             camp::idx_t stride_inner,

                             camp::idx_t stride_outer)

   {

     auto lane = get_lane();


     // compute segment and segment_size

     auto seg = lane >> segbits;

     auto i   = lane & ((1 << segbits) - 1);


     m_value = ptr[seg * stride_outer + i * stride_inner];


     return *this;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type& segmented_load_nm(element_type const* ptr,

                                camp::idx_t segbits,

                                camp::idx_t stride_inner,

                                camp::idx_t stride_outer,

                                camp::idx_t num_inner,

                                camp::idx_t num_outer)

   {

     auto lane = get_lane();


     // compute segment and segment_size

     auto seg = lane >> segbits;

     auto i   = lane & ((1 << segbits) - 1);


     if (seg >= num_outer || i >= num_inner)

     {

       m_value = element_type(0);

     }

     else

     {

       m_value = ptr[seg * stride_outer + i * stride_inner];

     }


     return *this;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type const& store_packed(element_type* ptr) const

   {


     auto lane = get_lane();


     ptr[lane] = m_value;


     return *this;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type const& store_packed_n(element_type* ptr, int N) const

   {


     auto lane = get_lane();


     if (lane < N)

     {

       ptr[lane] = m_value;

     }

     return *this;

   }


   // Strided store: the lane reported by get_lane() writes its stored value
   // to ptr[lane * stride].
   //
   //   ptr    - base address of the destination
   //   stride - distance, in elements, between the elements written by
   //            consecutive lanes
   //
   // Returns *this.
   RAJA_INLINE
   RAJA_DEVICE
   self_type const& store_strided(element_type* ptr, int stride) const
   {
     // Widen to 64 bits before multiplying: stride and the lane index are
     // both int, so their product could otherwise overflow for large strides.
     auto const offset = static_cast<int64_t>(stride) * get_lane();

     ptr[offset] = m_value;

     return *this;
   }


   // Partial strided store: only lanes with get_lane() < N write their
   // stored value to ptr[lane * stride]; the remaining lanes do nothing.
   //
   //   ptr    - base address of the destination
   //   stride - distance, in elements, between the elements written by
   //            consecutive lanes
   //   N      - number of leading lanes that perform a store
   //
   // Returns *this.
   RAJA_INLINE
   RAJA_DEVICE
   self_type const& store_strided_n(element_type* ptr, int stride, int N) const
   {
     auto lane = get_lane();

     if (lane < N)
     {
       // Widen to 64 bits before multiplying: stride and lane are both int,
       // so their product could otherwise overflow for large strides.
       ptr[static_cast<int64_t>(stride) * lane] = m_value;
     }

     return *this;
   }


   template<typename T2>

   RAJA_DEVICE RAJA_INLINE self_type const& scatter(element_type* ptr,

                                                    T2 const& offsets) const

   {


     ptr[offsets.get_raw_value()] = m_value;


     return *this;

   }


   template<typename T2>

   RAJA_DEVICE RAJA_INLINE self_type const& scatter_n(element_type* ptr,

                                                      T2 const& offsets,

                                                      camp::idx_t N) const

   {

     if (get_lane() < N)

     {

       ptr[offsets.get_raw_value()] = m_value;

     }


     return *this;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type const& segmented_store(element_type* ptr,

                                    camp::idx_t segbits,

                                    camp::idx_t stride_inner,

                                    camp::idx_t stride_outer) const

   {

     auto lane = get_lane();


     // compute segment and segment_size

     auto seg = lane >> segbits;

     auto i   = lane & ((1 << segbits) - 1);


     ptr[seg * stride_outer + i * stride_inner] = m_value;


     return *this;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type const& segmented_store_nm(element_type* ptr,

                                       camp::idx_t segbits,

                                       camp::idx_t stride_inner,

                                       camp::idx_t stride_outer,

                                       camp::idx_t num_inner,

                                       camp::idx_t num_outer) const

   {

     auto lane = get_lane();


     // compute segment and segment_size

     auto seg = lane >> segbits;

     auto i   = lane & ((1 << segbits) - 1);


     if (seg >= num_outer || i >= num_inner)

     {

       // nop

     }

     else

     {

       ptr[seg * stride_outer + i * stride_inner] = m_value;

     }


     return *this;

   }


   constexpr RAJA_INLINE RAJA_DEVICE element_type get(int i) const

   {

     return __shfl_sync(0xffffffff, m_value, i);

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type& set(element_type value, int i)

   {

     auto lane = get_lane();

     if (lane == i)

     {

       m_value = value;

     }

     return *this;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type& broadcast(element_type const& a)

   {

     m_value = a;

     return *this;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type get_and_broadcast(int i) const

   {

     self_type x;

     x.m_value = __shfl_sync(0xffffffff, m_value, i);

     return x;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type& copy(self_type const& src)

   {

     m_value = src.m_value;

     return *this;

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type add(self_type const& b) const

   {

     return self_type(m_value + b.m_value);

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type subtract(self_type const& b) const

   {

     return self_type(m_value - b.m_value);

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type multiply(self_type const& b) const

   {

     return self_type(m_value * b.m_value);

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type divide(self_type const& b) const

   {

     return self_type(m_value / b.m_value);

   }


   RAJA_DEVICE


   RAJA_INLINE

   self_type divide_n(self_type const& b, int N) const

   {

     return get_lane() < N ? self_type(m_value / b.m_value)

                           : self_type(element_type(0));

   }


   template<typename RETURN_TYPE = self_type>

   RAJA_DEVICE RAJA_INLINE

       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,

                               RETURN_TYPE>::type

       multiply_add(self_type const& b, self_type const& c) const

   {

     return self_type(fma(m_value, b.m_value, c.m_value));

   }


   template<typename RETURN_TYPE = self_type>

   RAJA_DEVICE RAJA_INLINE

       typename std::enable_if<std::numeric_limits<element_type>::is_integer,

                               RETURN_TYPE>::type

       multiply_add(self_type const& b, self_type const& c) const

   {

     return self_type(m_value * b.m_value + c.m_value);

   }


   template<typename RETURN_TYPE = self_type>

   RAJA_DEVICE RAJA_INLINE

       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,

                               RETURN_TYPE>::type

       multiply_subtract(self_type const& b, self_type const& c) const

   {

     return self_type(fma(m_value, b.m_value, -c.m_value));

   }


   template<typename RETURN_TYPE = self_type>

   RAJA_DEVICE RAJA_INLINE

       typename std::enable_if<std::numeric_limits<element_type>::is_integer,

                               RETURN_TYPE>::type

       multiply_subtract(self_type const& b, self_type const& c) const

   {

     return self_type(m_value * b.m_value - c.m_value);

   }


   RAJA_INLINE


   RAJA_DEVICE

   element_type sum() const

   {

     // Allreduce sum

     using combiner_t =

         RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;


     return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);

   }


   RAJA_INLINE


   RAJA_DEVICE

   element_type max() const

   {

     // Allreduce maximum

     using combiner_t =

         RAJA::reduce::detail::op_adapter<element_type,

                                          RAJA::operators::maximum>;


     return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);

   }


   RAJA_INLINE


   RAJA_DEVICE

   element_type max_n(int N) const

   {

     // Allreduce maximum

     using combiner_t =

         RAJA::reduce::detail::op_adapter<element_type,

                                          RAJA::operators::maximum>;


     auto ident = RAJA::operators::limits<element_type>::min();

     auto lane  = get_lane();

     auto value = lane < N ? m_value : ident;

     return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type vmax(self_type a) const

   {

     return self_type {RAJA::max<element_type>(m_value, a.m_value)};

   }


   RAJA_INLINE


   RAJA_DEVICE

   element_type min() const

   {

     // Allreduce minimum

     using combiner_t =

         RAJA::reduce::detail::op_adapter<element_type,

                                          RAJA::operators::minimum>;


     return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);

   }


   RAJA_INLINE


   RAJA_DEVICE

   element_type min_n(int N) const

   {

     // Allreduce minimum

     using combiner_t =

         RAJA::reduce::detail::op_adapter<element_type,

                                          RAJA::operators::minimum>;


     auto ident = RAJA::operators::limits<element_type>::max();

     auto lane  = get_lane();

     auto value = lane < N ? m_value : ident;

     return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type vmin(self_type a) const

   {

     return self_type {RAJA::min<element_type>(m_value, a.m_value)};

   }


   RAJA_INLINE


   RAJA_DEVICE

   static int_vector_type s_segmented_offsets(camp::idx_t segbits,

                                              camp::idx_t stride_inner,

                                              camp::idx_t stride_outer)

   {

     int_vector_type result;


     auto lane = get_lane();


     // compute segment and segment_size

     auto seg = lane >> segbits;

     auto i   = lane & ((1 << segbits) - 1);


     result.get_raw_value() = seg * stride_outer + i * stride_inner;


     return result;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type segmented_sum_inner(camp::idx_t segbits,

                                 camp::idx_t output_segment) const

   {


     // First: tree reduce values within each segment

     element_type x = m_value;

     RAJA_UNROLL

     for (int delta = 1; delta < 1 << segbits; delta = delta << 1)

     {


       // tree shuffle

       element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);


       // reduce

       x += y;

     }


     // Second: send result to output segment lanes

     self_type result;

     result.get_raw_value() = __shfl_sync(0xffffffff, x, get_lane() << segbits);


     // Third: mask off everything but output_segment

     //        this is because all output segments are valid at this point

     static constexpr int log2_warp_size = RAJA::log2(RAJA_CUDA_WARPSIZE);

     int our_output_segment = get_lane() >> (log2_warp_size - segbits);

     bool in_output_segment = our_output_segment == output_segment;

     if (!in_output_segment)

     {

       result.get_raw_value() = 0;

     }


     return result;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type segmented_sum_outer(camp::idx_t segbits,

                                 camp::idx_t output_segment) const

   {


     // First: tree reduce values within each segment

     element_type x                      = m_value;

     static constexpr int log2_warp_size = RAJA::log2(RAJA_CUDA_WARPSIZE);

     RAJA_UNROLL

     for (int i = 0; i < log2_warp_size - segbits; ++i)

     {


       // tree shuffle

       int delta      = s_num_elem >> (i + 1);

       element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);


       // reduce

       x += y;

     }


     // Second: send result to output segment lanes

     self_type result;

     int get_from           = get_lane() & ((1 << segbits) - 1);

     result.get_raw_value() = __shfl_sync(0xffffffff, x, get_from);


     int mask = (get_lane() >> segbits) == output_segment;


     // Third: mask off everything but output_segment

     if (!mask)

     {

       result.get_raw_value() = 0;

     }


     return result;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type segmented_divide_nm(self_type den,

                                 camp::idx_t segbits,

                                 camp::idx_t num_inner,

                                 camp::idx_t num_outer) const

   {

     self_type result;


     auto lane = get_lane();


     // compute segment and segment_size

     auto seg = lane >> segbits;

     auto i   = lane & ((1 << segbits) - 1);


     if (seg >= num_outer || i >= num_inner)

     {

       // nop

     }

     else

     {

       result.get_raw_value() = m_value / den.get_raw_value();

     }


     return result;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type segmented_broadcast_inner(camp::idx_t segbits,

                                       camp::idx_t input_segment) const

   {

     self_type result;


     camp::idx_t mask   = (1 << segbits) - 1;

     camp::idx_t offset = input_segment << segbits;


     camp::idx_t i = (get_lane() & mask) + offset;


     result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);


     return result;

   }


   RAJA_INLINE


   RAJA_DEVICE

   self_type segmented_broadcast_outer(camp::idx_t segbits,

                                       camp::idx_t input_segment) const

   {

     self_type result;


     camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);


     camp::idx_t i = (get_lane() >> segbits) + offset;


     result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);


     return result;

   }

 };


 }  // namespace expt


 }  // namespace RAJA


 #endif  // CUDA


 #endif  // Guard

Operators.hpp
Header file for RAJA operator definitions.

RegisterBase.hpp
RAJA header file defining SIMD/SIMT register operations.

intrinsics.hpp
Header file containing RAJA intrinsics templates for CUDA execution.

macros.hpp
Header file for common RAJA internal macro definitions.

RAJA_DEVICE
#define RAJA_DEVICE
Definition: macros.hpp:66

RAJA
Definition: AlignedRangeIndexSetBuilders.cpp:35

RAJA::min
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161

RAJA::named_dim::y
@ y

RAJA::named_dim::x
@ x

RAJA::sum
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143

RAJA::get
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56

RAJA::log2
RAJA_HOST_DEVICE constexpr RAJA_INLINE T log2(T n) noexcept
evaluate log base 2 of n
Definition: math.hpp:40

RAJA::max
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155

RAJA::operators::maximum
Definition: Operators.hpp:580

RAJA::operators::minimum
Definition: Operators.hpp:559

RAJA::reduce::detail::op_adapter
Definition: reduce.hpp:70