22 #ifndef RAJA_policy_vector_register_avx512_double_HPP
23 #define RAJA_policy_vector_register_avx512_double_HPP
25 #include "RAJA/config.hpp"
30 #include <immintrin.h>
/*!
 * @brief Register specialization for `double` elements under the
 *        avx512_register policy. The value is stored in a single __m512d.
 *
 * NOTE(review): this listing looks like a lossy extract -- the `template<>`
 * introducer, access specifiers and braces are not visible here.
 */
39 class Register<double, avx512_register>
40 :
public internal::expt::RegisterBase<Register<double, avx512_register>>
// NOTE(review): tail of an alias declaration whose head is not visible in this
// extract; the constructors below refer to `base_type`, so this is presumably
// that alias -- confirm against the full file.
44 internal::expt::RegisterBase<Register<double, avx512_register>>;
// Policy tag, this class's own type, the scalar element type, and the raw
// vector type wrapped by the register.
47 using register_policy = avx512_register;
48 using self_type = Register<double, avx512_register>;
49 using element_type = double;
50 using register_type = __m512d;
// Companion register type with 64-bit integer elements under the same policy.
52 using int_vector_type = Register<int64_t, avx512_register>;
// The wrapped 512-bit vector value.
56 register_type m_value;
/*!
 * @brief Maps an element count N to an 8-bit lane mask (__mmask8).
 *
 * The return values visible here are 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F,
 * 0x3F, 0x7F and 0xFF, i.e. masks with the low 0..8 bits set.
 *
 * NOTE(review): the control flow that selects among these returns (presumably
 * a switch on N) is not visible in this extract, so the behaviour for N
 * outside 0..8 cannot be determined from here -- confirm against the full file.
 */
59 __mmask8 createMask(camp::idx_t N)
const
65 return __mmask8(0x00);
67 return __mmask8(0x01);
69 return __mmask8(0x03);
71 return __mmask8(0x07);
73 return __mmask8(0x0F);
75 return __mmask8(0x1F);
77 return __mmask8(0x3F);
79 return __mmask8(0x7F);
81 return __mmask8(0xFF);
87 __m512i createStridedOffsets(camp::idx_t stride)
const
90 auto vstride = _mm512_set1_epi64(stride);
91 auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
92 return _mm512_mullo_epi64(vstride, vseq);
// Number of double lanes held by one register (512 bits / 64 bits per double).
96 static constexpr camp::idx_t s_num_elem = 8;
/*!
 * @brief Default constructor: initializes the stored vector with
 *        _mm512_setzero_pd(), i.e. every lane is 0.0.
 */
103 Register() : base_type(), m_value(_mm512_setzero_pd()) {}
/*!
 * @brief Wraps an existing raw __m512d value.
 *
 * Marked explicit, so a raw vector is never converted to a Register
 * implicitly.
 *
 * @param c  Raw vector copied into this register.
 */
109 explicit Register(register_type
const& c) : base_type(), m_value(c) {}
/*!
 * @brief Copy constructor: copies the stored vector of another register.
 *
 * @param c  Register to copy from.
 */
115 Register(self_type
const& c) : base_type(), m_value(c.m_value) {}
/*!
 * @brief Copy-assignment operator taking another register of the same type.
 *
 * NOTE(review): the body is not visible in this extract. From the signature
 * it presumably copies c's value and returns *this -- confirm against the
 * full file.
 */
121 self_type& operator=(self_type
const& c)
/*!
 * @brief Broadcast constructor: fills every lane with the scalar c via
 *        _mm512_set1_pd.
 *
 * Not marked explicit, so a scalar `double` converts implicitly to a register.
 *
 * @param c  Scalar value replicated into all lanes.
 */
133 Register(element_type
const& c) : base_type(), m_value(_mm512_set1_pd(c)) {}
/*!
 * @brief Loads a full register of contiguous doubles starting at ptr, using
 *        the unaligned load _mm512_loadu_pd.
 *
 * @param ptr  Address of the first element to read.
 * @return     Reference to a register (presumably *this; the return statement
 *             is not visible in this extract).
 */
140 self_type& load_packed(element_type
const* ptr)
143 m_value = _mm512_loadu_pd(ptr);
/*!
 * @brief Masked contiguous load using the lane mask createMask(N).
 *
 * Uses _mm512_mask_loadu_pd with a zero vector as the pass-through source, so
 * lanes not selected by the mask end up as 0.0.
 *
 * @param ptr  Address of the first element to read.
 * @param N    Element count passed to createMask.
 * @return     Reference to a register (presumably *this; the return statement
 *             is not visible in this extract).
 */
153 self_type& load_packed_n(element_type
const* ptr, camp::idx_t N)
156 m_value = _mm512_mask_loadu_pd(_mm512_setzero_pd(), createMask(N), ptr);
/*!
 * @brief Gathers a full register from memory with a constant element stride.
 *
 * The index vector comes from createStridedOffsets(stride) and is scaled by
 * sizeof(element_type), so lane k reads ptr[k * stride].
 *
 * @param ptr     Base address of the gather.
 * @param stride  Distance between consecutive lanes, in elements.
 * @return        Reference to a register (presumably *this; the return
 *                statement is not visible in this extract).
 */
165 self_type& load_strided(element_type
const* ptr, camp::idx_t stride)
168 m_value = _mm512_i64gather_pd(createStridedOffsets(stride), ptr,
169 sizeof(element_type));
/*!
 * @brief Masked strided gather: lanes selected by createMask(N) are read
 *        from ptr at offsets from createStridedOffsets(stride).
 *
 * A zero vector is passed as the pass-through source to
 * _mm512_mask_i64gather_pd, so unselected lanes end up as 0.0.
 *
 * NOTE(review): the rest of the parameter list is not visible in this
 * extract; the body uses `stride` and `N`, so those are presumably the
 * remaining parameters -- confirm their order against the full file.
 *
 * @param ptr  Base address of the gather.
 */
179 self_type& load_strided_n(element_type
const* ptr,
184 m_value = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), createMask(N),
185 createStridedOffsets(stride), ptr,
186 sizeof(element_type));
/*!
 * @brief Stores the full register to contiguous memory starting at ptr, using
 *        the unaligned store _mm512_storeu_pd.
 *
 * @param ptr  Address of the first element to write.
 * @return     Const reference to a register (presumably *this; the return
 *             statement is not visible in this extract).
 */
195 self_type
const& store_packed(element_type* ptr)
const
198 _mm512_storeu_pd(ptr, m_value);
/*!
 * @brief Masked contiguous store: only lanes selected by createMask(N) are
 *        written to memory (via _mm512_mask_storeu_pd).
 *
 * @param ptr  Address of the first element to write.
 * @param N    Element count passed to createMask.
 * @return     Const reference to a register (presumably *this; the return
 *             statement is not visible in this extract).
 */
207 self_type
const& store_packed_n(element_type* ptr, camp::idx_t N)
const
210 _mm512_mask_storeu_pd(ptr, createMask(N), m_value);
/*!
 * @brief Scatters the full register to memory with a constant element stride.
 *
 * The index vector comes from createStridedOffsets(stride) and is scaled by
 * sizeof(element_type), so lane k is written to ptr[k * stride].
 *
 * @param ptr     Base address of the scatter.
 * @param stride  Distance between consecutive lanes, in elements.
 * @return        Const reference to a register (presumably *this; the return
 *                statement is not visible in this extract).
 */
219 self_type
const& store_strided(element_type* ptr, camp::idx_t stride)
const
222 _mm512_i64scatter_pd(ptr, createStridedOffsets(stride), m_value,
223 sizeof(element_type));
/*!
 * @brief Masked strided scatter: only lanes selected by createMask(N) are
 *        written, at offsets from createStridedOffsets(stride).
 *
 * NOTE(review): the rest of the parameter list is not visible in this
 * extract; the body uses `stride` and `N`, so those are presumably the
 * remaining parameters -- confirm their order against the full file.
 *
 * @param ptr  Base address of the scatter.
 */
232 self_type
const& store_strided_n(element_type* ptr,
237 _mm512_mask_i64scatter_pd(ptr, createMask(N), createStridedOffsets(stride),
238 m_value,
sizeof(element_type));
248 element_type
get(camp::idx_t i)
const {
return m_value[i]; }
/*!
 * @brief Lane setter taking a scalar value and a lane index.
 *
 * NOTE(review): the body is not visible in this extract. From the signature
 * it presumably writes `value` into lane `i` and returns *this -- confirm
 * against the full file.
 */
256 self_type& set(element_type value, camp::idx_t i)
/*!
 * @brief Overwrites every lane with the same scalar via _mm512_set1_pd.
 *
 * @param value  Scalar replicated into all lanes.
 * @return       Reference to a register (presumably *this; the return
 *               statement is not visible in this extract).
 */
265 self_type& broadcast(element_type
const& value)
267 m_value = _mm512_set1_pd(value);
/*!
 * @brief Copies the stored vector of src into this register.
 *
 * @param src  Register to copy from.
 * @return     Reference to a register (presumably *this; the return statement
 *             is not visible in this extract).
 */
274 self_type& copy(self_type
const& src)
276 m_value = src.m_value;
/*!
 * @brief Lane-wise addition (_mm512_add_pd).
 *
 * @param b  Right-hand operand.
 * @return   New register holding (*this) + b in each lane.
 */
283 self_type add(self_type
const& b)
const
285 return self_type(_mm512_add_pd(m_value, b.m_value));
/*!
 * @brief Lane-wise subtraction (_mm512_sub_pd).
 *
 * @param b  Right-hand operand.
 * @return   New register holding (*this) - b in each lane.
 */
291 self_type subtract(self_type
const& b)
const
293 return self_type(_mm512_sub_pd(m_value, b.m_value));
/*!
 * @brief Lane-wise multiplication (_mm512_mul_pd).
 *
 * @param b  Right-hand operand.
 * @return   New register holding (*this) * b in each lane.
 */
299 self_type multiply(self_type
const& b)
const
301 return self_type(_mm512_mul_pd(m_value, b.m_value));
/*!
 * @brief Lane-wise division over all lanes (_mm512_div_pd).
 *
 * @param b  Divisor register.
 * @return   New register holding (*this) / b in each lane.
 */
307 self_type divide(self_type
const& b)
const
309 return self_type(_mm512_div_pd(m_value, b.m_value));
/*!
 * @brief Masked lane-wise division using the lane mask createMask(N).
 *
 * Uses the zero-masking form _mm512_maskz_div_pd, so lanes not selected by
 * the mask are 0.0 in the result rather than a quotient.
 *
 * @param b  Divisor register.
 * @param N  Element count passed to createMask.
 * @return   New register with quotients in selected lanes, zero elsewhere.
 */
315 self_type divide_n(self_type
const& b, camp::idx_t N)
const
317 return self_type(_mm512_maskz_div_pd(createMask(N), m_value, b.m_value));
/*!
 * @brief Fused multiply-add via _mm512_fmadd_pd.
 *
 * @param b  Multiplier.
 * @param c  Addend.
 * @return   New register holding (*this) * b + c in each lane.
 */
325 self_type multiply_add(self_type
const& b, self_type
const& c)
const
327 return self_type(_mm512_fmadd_pd(m_value, b.m_value, c.m_value));
/*!
 * @brief Fused multiply-subtract via _mm512_fmsub_pd.
 *
 * @param b  Multiplier.
 * @param c  Subtrahend.
 * @return   New register holding (*this) * b - c in each lane.
 */
333 self_type multiply_subtract(self_type
const& b, self_type
const& c)
const
335 return self_type(_mm512_fmsub_pd(m_value, b.m_value, c.m_value));
344 element_type
sum()
const {
return _mm512_reduce_add_pd(m_value); }
351 element_type
max()
const {
return _mm512_reduce_max_pd(m_value); }
/*!
 * @brief Horizontal maximum restricted to the lanes selected by
 *        createMask(N), via _mm512_mask_reduce_max_pd.
 *
 * @param N  Element count passed to createMask.
 * @return   Maximum over the selected lanes.
 */
358 element_type max_n(camp::idx_t N)
const
360 return _mm512_mask_reduce_max_pd(createMask(N), m_value);
/*!
 * @brief Lane-wise maximum of this register and a (_mm512_max_pd).
 *
 * NOTE(review): `a` is taken by value here, whereas the binary arithmetic
 * members (add, subtract, multiply, divide) take `self_type const&`; consider
 * aligning the signatures.
 *
 * @param a  Other operand.
 * @return   New register holding the per-lane maximum.
 */
368 self_type vmax(self_type a)
const
370 return self_type(_mm512_max_pd(m_value, a.m_value));
378 element_type
min()
const {
return _mm512_reduce_min_pd(m_value); }
/*!
 * @brief Horizontal minimum restricted to the lanes selected by
 *        createMask(N), via _mm512_mask_reduce_min_pd.
 *
 * @param N  Element count passed to createMask.
 * @return   Minimum over the selected lanes.
 */
385 element_type min_n(camp::idx_t N)
const
387 return _mm512_mask_reduce_min_pd(createMask(N), m_value);
/*!
 * @brief Lane-wise minimum of this register and a (_mm512_min_pd).
 *
 * NOTE(review): `a` is taken by value here, whereas the binary arithmetic
 * members (add, subtract, multiply, divide) take `self_type const&`; consider
 * aligning the signatures.
 *
 * @param a  Other operand.
 * @return   New register holding the per-lane minimum.
 */
395 self_type vmin(self_type a)
const
397 return self_type(_mm512_min_pd(m_value, a.m_value));
RAJA header file defining SIMD/SIMT register operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155