22 #ifndef RAJA_policy_vector_register_avx_double_HPP
23 #define RAJA_policy_vector_register_avx_double_HPP
25 #include "RAJA/config.hpp"
30 #include <immintrin.h>
39 class Register<double, avx_register>
40 :
public internal::expt::RegisterBase<Register<double, avx_register>>
44 internal::expt::RegisterBase<Register<double, avx_register>>;
// Register policy this specialization is written for.
46 using register_policy = avx_register;
// Shorthand for this specialization's own type.
47 using self_type = Register<double, avx_register>;
// Scalar type stored in each lane.
48 using element_type = double;
// Underlying intrinsic vector type (256-bit AVX vector of doubles).
49 using register_type = __m256d;
// Register specialization with 64-bit integer elements under the same policy.
51 using int_vector_type = Register<int64_t, avx_register>;
// The wrapped AVX value that the member functions below read and write.
55 register_type m_value;
// Builds a per-lane 64-bit integer mask from an element count N; it is the
// mask passed to the masked load/store intrinsics in load_packed_n and
// store_packed_n.
// _mm256_set_epi64x takes its arguments highest lane first, so the visible
// arguments enable lane 3 when N >= 4, lane 2 when N >= 3 and lane 1 when
// N >= 2 (-1 = all bits set, 0 = lane disabled).
// NOTE(review): the final argument of the call (lane 0) is cut off in this
// listing -- presumably `N >= 1 ? -1 : 0`; confirm against the full source.
58 __m256i createMask(camp::idx_t N)
const
61 return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
// Returns a 4 x 64-bit integer vector holding {0, stride, 2*stride, 3*stride}
// in lanes 0..3 (_mm256_set_epi64x lists the highest lane first).
// No caller of this helper is visible in this listing; the strided load/store
// members shown here use scalar loops instead.
66 __m256i createStridedOffsets(camp::idx_t stride)
const
69 return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
// Compile-time element count of this register type: four doubles per __m256d.
73 static constexpr camp::idx_t s_num_elem = 4;
79 Register() : base_type(), m_value(_mm256_setzero_pd()) {}
// Constructs a register from four explicit lane values.
// _mm256_set_pd lists lanes highest first, so passing (x3, x2, x1, x0) places
// x0 in lane 0 and x3 in lane 3.
// NOTE(review): the start of the initializer list and the constructor body
// are missing from this listing.
85 Register(element_type x0, element_type x1, element_type x2, element_type x3)
87 m_value(_mm256_set_pd(x3, x2, x1, x0))
94 explicit Register(register_type
const& c) : base_type(), m_value(c) {}
100 Register(self_type
const& c) : base_type(), m_value(c.m_value) {}
// Copy assignment from another register of the same type.
// NOTE(review): only the signature survives in this listing; the body is not
// visible here.
106 self_type& operator=(self_type
const& c)
117 Register(element_type
const& c) : m_value(_mm256_set1_pd(c)) {}
// Loads four contiguous doubles starting at ptr into the register.
// _mm256_loadu_pd is the unaligned load, so ptr needs no 32-byte alignment.
// NOTE(review): the return statement is not visible in this listing.
124 self_type& load_packed(element_type
const* ptr)
126 m_value = _mm256_loadu_pd(ptr);
// Loads contiguous doubles from ptr through a masked load, using the lane
// mask produced by createMask(N). Lanes whose mask entry is 0 are not read
// from memory; the intrinsic sets them to zero.
// NOTE(review): the return statement is not visible in this listing.
136 self_type& load_packed_n(element_type
const* ptr, camp::idx_t N)
138 m_value = _mm256_maskload_pd(ptr, createMask(N));
// Fills all four lanes from elements spaced `stride` apart:
// lane i receives ptr[i * stride]. Done one lane at a time with scalar
// subscripting of m_value rather than a gather instruction.
// NOTE(review): the return statement is not visible in this listing.
147 self_type& load_strided(element_type
const* ptr, camp::idx_t stride)
149 for (camp::idx_t i = 0; i < 4; ++i)
151 m_value[i] = ptr[i * stride];
// Partial strided load: zeroes the whole register first, then fills lanes
// 0..N-1 with ptr[i * stride]; lanes at index >= N stay 0.
// NOTE(review): the rest of the parameter list (it must declare `stride` and
// `N`, both used below) and the return statement are cut from this listing.
// NOTE(review): nothing visible bounds N; N > 4 would subscript past the four
// lanes of m_value -- confirm callers guarantee N <= 4.
162 self_type& load_strided_n(element_type
const* ptr,
166 m_value = _mm256_setzero_pd();
167 for (camp::idx_t i = 0; i < N; ++i)
169 m_value[i] = ptr[i * stride];
// Writes all four lanes to four contiguous doubles starting at ptr.
// _mm256_storeu_pd is the unaligned store, so ptr needs no 32-byte alignment.
// NOTE(review): the return statement is not visible in this listing.
179 self_type
const& store_packed(element_type* ptr)
const
181 _mm256_storeu_pd(ptr, m_value);
// Writes lanes to contiguous memory at ptr through a masked store, using the
// lane mask produced by createMask(N); memory behind lanes whose mask entry
// is 0 is left untouched by the intrinsic.
// NOTE(review): the return statement is not visible in this listing.
190 self_type
const& store_packed_n(element_type* ptr, camp::idx_t N)
const
192 _mm256_maskstore_pd(ptr, createMask(N), m_value);
// Scatters all four lanes to memory spaced `stride` elements apart:
// ptr[i * stride] receives lane i, written one lane at a time.
// NOTE(review): the return statement is not visible in this listing.
201 self_type
const& store_strided(element_type* ptr, camp::idx_t stride)
const
203 for (camp::idx_t i = 0; i < 4; ++i)
205 ptr[i * stride] = m_value[i];
// Partial strided store: writes lanes 0..N-1 to ptr[i * stride] and leaves
// every other memory location untouched.
// NOTE(review): the rest of the parameter list (it must declare `stride` and
// `N`, both used below) and the return statement are cut from this listing.
// NOTE(review): nothing visible bounds N; N > 4 would read past the four
// lanes of m_value -- confirm callers guarantee N <= 4.
215 self_type
const& store_strided_n(element_type* ptr,
219 for (camp::idx_t i = 0; i < N; ++i)
221 ptr[i * stride] = m_value[i];
232 element_type
get(camp::idx_t i)
const {
return m_value[i]; }
// Lane setter taking a scalar `value` and a lane index `i`.
// NOTE(review): only the signature survives in this listing; the body is not
// visible here, so the argument order (value first, index second) is the only
// part of the contract that can be confirmed.
240 self_type& set(element_type value, camp::idx_t i)
// Overwrites every lane of the register with the same scalar `value`.
// NOTE(review): the return statement is not visible in this listing.
249 self_type& broadcast(element_type
const& value)
251 m_value = _mm256_set1_pd(value);
// Replaces this register's contents with those of `src`.
// NOTE(review): the return statement is not visible in this listing.
258 self_type& copy(self_type
const& src)
260 m_value = src.m_value;
267 self_type add(self_type
const& b)
const
269 return self_type(_mm256_add_pd(m_value, b.m_value));
275 self_type subtract(self_type
const& b)
const
277 return self_type(_mm256_sub_pd(m_value, b.m_value));
283 self_type multiply(self_type
const& b)
const
285 return self_type(_mm256_mul_pd(m_value, b.m_value));
291 self_type divide(self_type
const& b)
const
293 return self_type(_mm256_div_pd(m_value, b.m_value));
// Partial division: lane i of the result is get(i) / b.get(i) for i < N and
// 0 for every other lane.
// Each quotient is a scalar division guarded by its own conditional, so lanes
// at index >= N never evaluate a division at all (their divisor in b is never
// read). _mm256_set_pd lists lanes highest first, hence the 3, 2, 1, 0 order.
299 self_type divide_n(self_type
const& b, camp::idx_t N)
const
302 return self_type(_mm256_set_pd(
303 N >= 4 ?
get(3) / b.get(3) : 0, N >= 3 ?
get(2) / b.get(2) : 0,
304 N >= 2 ?
get(1) / b.get(1) : 0, N >= 1 ?
get(0) / b.get(0) : 0));
312 element_type
sum()
const
314 auto sh1 = _mm256_permute_pd(m_value, 0x5);
315 auto red1 = _mm256_add_pd(m_value, sh1);
316 return red1[0] + red1[2];
// Horizontal maximum of all four lanes, returned as a scalar.
324 element_type
max()
const
// Shuffle with 0x5 swaps neighbours inside each 128-bit half:
// a = { v1, v0, v3, v2 }.
328 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
// b[0] = max(v0, v1) and b[2] = max(v2, v3).
335 register_type b = _mm256_max_pd(m_value, a);
// Reduce the two half results to the final scalar.
338 return RAJA::max<element_type>(b[0], b[2]);
// Horizontal maximum restricted by an element count N.
// NOTE(review): the conditions that select between the three code paths below
// are missing from this listing -- presumably N == 4, N == 3 and N == 2, with
// further handling for other N; confirm against the full source.
346 element_type max_n(camp::idx_t N)
const
// Path 1: same reduction as max() -- a = { v1, v0, v3, v2 }, so the result
// covers all four lanes.
352 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
359 register_type b = _mm256_max_pd(m_value, a);
362 return RAJA::max<element_type>(b[0], b[2]);
// Path 2: shuffle with 0x3 yields a = { v1, v1, v2, v2 }, so b[0] is
// max(v0, v1) and b[2] is v2; lane 3 never reaches the result.
370 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
377 register_type b = _mm256_max_pd(m_value, a);
380 return RAJA::max<element_type>(b[0], b[2]);
// Path 3: only lanes 0 and 1 are compared.
384 return RAJA::max<element_type>(m_value[0], m_value[1]);
398 self_type vmax(self_type a)
const
400 return self_type(_mm256_max_pd(m_value, a.m_value));
// Horizontal minimum of all four lanes, returned as a scalar.
408 element_type
min()
const
// Shuffle with 0x5 swaps neighbours inside each 128-bit half:
// a = { v1, v0, v3, v2 }.
412 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
// b[0] = min(v0, v1) and b[2] = min(v2, v3).
419 register_type b = _mm256_min_pd(m_value, a);
// Reduce the two half results to the final scalar.
422 return RAJA::min<element_type>(b[0], b[2]);
// Horizontal minimum restricted by an element count N.
// NOTE(review): the conditions that select between the three code paths below
// are missing from this listing -- presumably N == 4, N == 3 and N == 2, with
// further handling for other N; confirm against the full source.
430 element_type min_n(camp::idx_t N)
const
// Path 1: same reduction as min() -- a = { v1, v0, v3, v2 }, so the result
// covers all four lanes.
436 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
443 register_type b = _mm256_min_pd(m_value, a);
446 return RAJA::min<element_type>(b[0], b[2]);
// Path 2: shuffle with 0x3 yields a = { v1, v1, v2, v2 }, so b[0] is
// min(v0, v1) and b[2] is v2; lane 3 never reaches the result.
454 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
461 register_type b = _mm256_min_pd(m_value, a);
464 return RAJA::min<element_type>(b[0], b[2]);
// Path 3: only lanes 0 and 1 are compared.
468 return RAJA::min<element_type>(m_value[0], m_value[1]);
482 self_type vmin(self_type a)
const
484 return self_type(_mm256_min_pd(m_value, a.m_value));
RAJA header file defining SIMD/SIMT register operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155