22 #ifndef RAJA_policy_vector_register_avx2_double_HPP
23 #define RAJA_policy_vector_register_avx2_double_HPP
25 #include "RAJA/config.hpp"
30 #include <immintrin.h>
// Specialization of Register for `double` elements under the avx2_register
// policy. It derives from RegisterBase instantiated on this class itself.
39 class Register<double, avx2_register>
40 :
public internal::expt::RegisterBase<Register<double, avx2_register>>
// NOTE(review): the start of this declaration is not visible in this listing;
// presumably it is the `base_type` alias that the copy constructor refers to.
44 internal::expt::RegisterBase<Register<double, avx2_register>>;
// Convenience aliases describing this register flavour.
46 using register_policy = avx2_register;
47 using self_type = Register<double, avx2_register>;
48 using element_type = double;
// Underlying intrinsic vector type that stores the lanes.
49 using register_type = __m256d;
// Register type used for index vectors (see gather / gather_n).
51 using int_vector_type = Register<int64_t, avx2_register>;
// The wrapped vector value operated on by the member functions.
54 register_type m_value;
// Builds a 64-bit-per-lane mask for the first N lanes: a lane is set to -1
// (all bits set) when it is enabled and to 0 otherwise.
// _mm256_set_epi64x takes its arguments from the highest lane down, so the
// `N >= 4` term controls lane 3, `N >= 3` lane 2 and `N >= 2` lane 1.
// NOTE(review): the final argument (lane 0) and the closing of the call are
// on a line that is not visible in this listing.
57 __m256i createMask(camp::idx_t N)
const
60 return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
65 __m256i createStridedOffsets(camp::idx_t stride)
const
68 return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
// Lane count of this register type; the strided store loop in this class
// iterates over the same number (4) of elements.
72 static constexpr camp::idx_t s_num_elem = 4;
78 Register() : m_value(_mm256_setzero_pd()) {}
// Constructs the register from four scalars, with x0 ending up in lane 0 and
// x3 in lane 3. The arguments are passed reversed because _mm256_set_pd
// takes the highest lane first.
// NOTE(review): the constructor body is not visible in this listing.
84 Register(element_type x0, element_type x1, element_type x2, element_type x3)
85 : m_value(_mm256_set_pd(x3, x2, x1, x0))
92 explicit Register(register_type
const& c) : m_value(c) {}
98 Register(self_type
const& c) : base_type(c), m_value(c.m_value) {}
// Copy-assignment operator taking another register of the same type.
// NOTE(review): the body is not visible in this listing; presumably it copies
// c.m_value and returns *this.
104 self_type& operator=(self_type
const& c)
115 Register(element_type
const& c) : m_value(_mm256_set1_pd(c)) {}
121 constexpr register_type get_register()
const {
return m_value; }
// Loads a full register of contiguous elements starting at `ptr`.
// NOTE(review): the #endif, the return statement and the braces of this
// function are not visible in this listing.
128 self_type& load_packed(element_type
const* ptr)
// Optional instrumentation: counts packed loads when stats are enabled.
130 #ifdef RAJA_ENABLE_VECTOR_STATS
131 RAJA::tensor_stats::num_vector_load_packed++;
// The `loadu` intrinsic places no alignment requirement on `ptr`.
133 m_value = _mm256_loadu_pd(ptr);
// Loads contiguous elements starting at `ptr`, restricted to the lanes
// enabled by createMask(N).
// NOTE(review): the #endif, the return statement and the braces of this
// function are not visible in this listing.
143 self_type& load_packed_n(element_type
const* ptr, camp::idx_t N)
// Optional instrumentation: counts partial packed loads.
145 #ifdef RAJA_ENABLE_VECTOR_STATS
146 RAJA::tensor_stats::num_vector_load_packed_n++;
// Masked load: lanes whose mask is clear are not read and come back as zero.
148 m_value = _mm256_maskload_pd(ptr, createMask(N));
// Loads a full register from elements spaced `stride` elements apart,
// starting at `ptr`.
// NOTE(review): the #endif, the return statement and the braces of this
// function are not visible in this listing.
157 self_type& load_strided(element_type
const* ptr, camp::idx_t stride)
// Optional instrumentation: counts strided loads.
159 #ifdef RAJA_ENABLE_VECTOR_STATS
160 RAJA::tensor_stats::num_vector_load_strided++;
// Gather with per-lane offsets {0, stride, 2*stride, 3*stride}; the scale of
// sizeof(element_type) makes those offsets count elements, not bytes.
162 m_value = _mm256_i64gather_pd(ptr, createStridedOffsets(stride),
163 sizeof(element_type));
// Strided load restricted to the lanes enabled by createMask(N).
// NOTE(review): the rest of the parameter list (the body uses `stride` and
// `N`), the #endif, the return statement and the braces are not visible in
// this listing.
173 self_type& load_strided_n(element_type
const* ptr,
// Optional instrumentation: counts partial strided loads.
177 #ifdef RAJA_ENABLE_VECTOR_STATS
178 RAJA::tensor_stats::num_vector_load_strided_n++;
// Masked gather: disabled lanes take their value from the first argument
// (all zeros) instead of reading memory. The integer mask is reinterpreted
// as a double vector because the intrinsic expects a floating-point mask.
180 m_value = _mm256_mask_i64gather_pd(
181 _mm256_setzero_pd(), ptr, createStridedOffsets(stride),
182 _mm256_castsi256_pd(createMask(N)),
sizeof(element_type));
// Gathers one element per lane from `ptr` at the per-lane offsets held in
// `offsets`.
// NOTE(review): the line that receives the gather result (presumably an
// assignment to m_value), the #endif, the return statement and the braces
// are not visible in this listing.
196 self_type& gather(element_type
const* ptr, int_vector_type offsets)
// NOTE(review): this increments the same counter as load_strided_n, which
// looks like a copy-paste; confirm whether a dedicated gather counter exists.
198 #ifdef RAJA_ENABLE_VECTOR_STATS
199 RAJA::tensor_stats::num_vector_load_strided_n++;
// Scale of sizeof(element_type): the offsets count elements, not bytes.
202 _mm256_i64gather_pd(ptr, offsets.get_register(),
sizeof(element_type));
// Gather restricted to the lanes enabled by createMask(N).
// NOTE(review): the last parameter (the body uses `N`), the #endif, the
// return statement and the braces are not visible in this listing.
216 self_type& gather_n(element_type
const* ptr,
217 int_vector_type offsets,
// NOTE(review): this increments the same counter as load_strided_n, which
// looks like a copy-paste; confirm whether a dedicated gather counter exists.
220 #ifdef RAJA_ENABLE_VECTOR_STATS
221 RAJA::tensor_stats::num_vector_load_strided_n++;
// Masked gather: disabled lanes take the zero from the first argument and
// are not read from memory.
223 m_value = _mm256_mask_i64gather_pd(
224 _mm256_setzero_pd(), ptr, offsets.get_register(),
225 _mm256_castsi256_pd(createMask(N)),
sizeof(element_type));
// Stores all lanes to contiguous memory starting at `ptr`.
// NOTE(review): the #endif, the return statement and the braces of this
// function are not visible in this listing.
234 self_type
const& store_packed(element_type* ptr)
const
// Optional instrumentation: counts packed stores.
236 #ifdef RAJA_ENABLE_VECTOR_STATS
237 RAJA::tensor_stats::num_vector_store_packed++;
// The `storeu` intrinsic places no alignment requirement on `ptr`.
239 _mm256_storeu_pd(ptr, m_value);
// Stores the lanes enabled by createMask(N) to contiguous memory at `ptr`.
// NOTE(review): the #endif, the return statement and the braces of this
// function are not visible in this listing.
248 self_type
const& store_packed_n(element_type* ptr, camp::idx_t N)
const
// Optional instrumentation: counts partial packed stores.
250 #ifdef RAJA_ENABLE_VECTOR_STATS
251 RAJA::tensor_stats::num_vector_store_packed_n++;
// Masked store: memory behind disabled lanes is left untouched.
253 _mm256_maskstore_pd(ptr, createMask(N), m_value);
// Stores all four lanes to memory locations `stride` elements apart.
// NOTE(review): the #endif, the return statement and the braces of this
// function are not visible in this listing.
262 self_type
const& store_strided(element_type* ptr, camp::idx_t stride)
const
// Optional instrumentation: counts strided stores.
264 #ifdef RAJA_ENABLE_VECTOR_STATS
265 RAJA::tensor_stats::num_vector_store_strided++;
// Scalar loop rather than a scatter intrinsic; lane i goes to ptr[i * stride].
// The bound is the literal 4 rather than s_num_elem. Subscripting the vector
// relies on the compiler's vector-type extension.
267 for (camp::idx_t i = 0; i < 4; ++i)
269 ptr[i * stride] = m_value[i];
// Stores the first N lanes to memory locations `stride` elements apart.
// NOTE(review): the rest of the parameter list (the body uses `stride` and
// `N`), the #endif, the return statement and the braces are not visible in
// this listing.
279 self_type
const& store_strided_n(element_type* ptr,
// Optional instrumentation: counts partial strided stores.
283 #ifdef RAJA_ENABLE_VECTOR_STATS
284 RAJA::tensor_stats::num_vector_store_strided_n++;
// Scalar loop; lane i goes to ptr[i * stride]. No visible check keeps N
// within the lane count, so callers are presumably expected to pass N <= 4.
286 for (camp::idx_t i = 0; i < N; ++i)
288 ptr[i * stride] = m_value[i];
299 element_type
get(camp::idx_t i)
const {
return m_value[i]; }
// Setter taking a scalar `value` and a lane index `i`.
// NOTE(review): the body is not visible in this listing; presumably it
// writes `value` into lane `i` and returns *this.
307 self_type& set(element_type value, camp::idx_t i)
// Overwrites every lane with the scalar `value`.
// NOTE(review): the return statement and the braces are not visible in this
// listing.
316 self_type& broadcast(element_type
const& value)
318 m_value = _mm256_set1_pd(value);
// Returns a new register in which every lane holds the value of one lane of
// this register.
// NOTE(review): the dispatch on `i` that selects among the four returns is
// not visible in this listing; presumably i == 0..3 map to them in order.
328 self_type get_and_broadcast(
int i)
const
// Each immediate repeats the same 2-bit lane selector four times:
// 0x00 replicates lane 0, 0x55 lane 1, 0xAA lane 2, 0xFF lane 3.
333 return self_type(_mm256_permute4x64_pd(m_value, 0x00));
335 return self_type(_mm256_permute4x64_pd(m_value, 0x55));
337 return self_type(_mm256_permute4x64_pd(m_value, 0xAA));
339 return self_type(_mm256_permute4x64_pd(m_value, 0xFF));
// Copies the wrapped vector value of `src` into this register.
// NOTE(review): the return statement and the braces are not visible in this
// listing.
347 self_type& copy(self_type
const& src)
349 m_value = src.m_value;
356 self_type add(self_type
const& b)
const
358 return self_type(_mm256_add_pd(m_value, b.m_value));
364 self_type subtract(self_type
const& b)
const
366 return self_type(_mm256_sub_pd(m_value, b.m_value));
372 self_type multiply(self_type
const& b)
const
374 return self_type(_mm256_mul_pd(m_value, b.m_value));
380 self_type divide(self_type
const& b)
const
382 return self_type(_mm256_div_pd(m_value, b.m_value));
388 self_type divide_n(self_type
const& b, camp::idx_t N)
const
391 return self_type(_mm256_set_pd(
392 N >= 4 ?
get(3) / b.get(3) : 0, N >= 3 ?
get(2) / b.get(2) : 0,
393 N >= 2 ?
get(1) / b.get(1) : 0, N >= 1 ?
get(0) / b.get(0) : 0));
401 self_type multiply_add(self_type
const& b, self_type
const& c)
const
403 return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
409 self_type multiply_subtract(self_type
const& b, self_type
const& c)
const
411 return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
420 element_type
sum(camp::idx_t = 4)
const
422 auto sh1 = _mm256_permute_pd(m_value, 0x5);
423 auto red1 = _mm256_add_pd(m_value, sh1);
424 return red1[0] + red1[2];
// Horizontal maximum over the first N lanes (N defaults to 4).
// NOTE(review): the conditions selecting among the code paths below, and any
// remaining fallback path, are not visible in this listing; presumably the
// three visible paths handle N == 4, N == 3 and N == 2 in that order.
432 element_type
max(camp::idx_t N = 4)
const
// Path 1: swap neighbours within each 128-bit half, a = {v1, v0, v3, v2}.
438 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
// b[0] = max(v0, v1) and b[2] = max(v2, v3).
445 register_type b = _mm256_max_pd(m_value, a);
448 return RAJA::max<element_type>(b[0], b[2]);
// Path 2: a = {v1, v1, v2, v2}, so lane 3 never reaches b[0] or b[2].
456 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
// b[0] = max(v0, v1) and b[2] = v2.
463 register_type b = _mm256_max_pd(m_value, a);
466 return RAJA::max<element_type>(b[0], b[2]);
// Path 3: only the first two lanes are compared, with scalar code.
470 return RAJA::max<element_type>(m_value[0], m_value[1]);
484 self_type vmax(self_type a)
const
486 return self_type(_mm256_max_pd(m_value, a.m_value));
494 element_type
min()
const
498 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
505 register_type b = _mm256_min_pd(m_value, a);
508 return RAJA::min<element_type>(b[0], b[2]);
// Horizontal minimum over the first N lanes.
// NOTE(review): the conditions selecting among the code paths below, and any
// remaining fallback path, are not visible in this listing; presumably the
// three visible paths handle N == 4, N == 3 and N == 2 in that order.
// NOTE(review): this function uses std::min while min() uses RAJA::min;
// confirm whether the difference is intentional and that <algorithm> is
// included.
516 element_type min_n(camp::idx_t N)
const
// Path 1: swap neighbours within each 128-bit half, a = {v1, v0, v3, v2}.
522 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
// b[0] = min(v0, v1) and b[2] = min(v2, v3).
529 register_type b = _mm256_min_pd(m_value, a);
532 return std::min<element_type>(b[0], b[2]);
// Path 2: a = {v1, v1, v2, v2}, so lane 3 never reaches b[0] or b[2].
540 register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
// b[0] = min(v0, v1) and b[2] = v2.
547 register_type b = _mm256_min_pd(m_value, a);
550 return std::min<element_type>(b[0], b[2]);
// Path 3: only the first two lanes are compared, with scalar code.
554 return std::min<element_type>(m_value[0], m_value[1]);
568 self_type vmin(self_type a)
const
570 return self_type(_mm256_min_pd(m_value, a.m_value));
RAJA header file defining SIMD/SIMT register operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155