22 #ifndef RAJA_policy_vector_register_avx512_long_HPP
23 #define RAJA_policy_vector_register_avx512_long_HPP
25 #include "RAJA/config.hpp"
30 #include <immintrin.h>
38 class Register<int64_t, avx512_register>
39 :
public internal::expt::RegisterBase<Register<int64_t, avx512_register>>
43 internal::expt::RegisterBase<Register<int64_t, avx512_register>>;
// Member type aliases and storage for the int64_t / avx512_register specialization.
// The register policy this specialization is written for.
45 using register_policy = avx512_register;
// This class's own type; used as parameter and return type by the operations.
46 using self_type = Register<int64_t, avx512_register>;
// Scalar type stored in each lane.
47 using element_type = int64_t;
// Raw intrinsic vector type wrapped by this class.
48 using register_type = __m512i;
// For this specialization the integer-vector type is the same type as self_type.
50 using int_vector_type = Register<int64_t, avx512_register>;
// The wrapped vector value that the member functions read and write.
54 register_type m_value;
// Produces an 8-bit lane mask (__mmask8) from an element count N.
// The return statements visible here yield masks with the lowest 0, 1, 2, ...
// up to 8 bits set (0x00 through 0xFF).
// NOTE(review): the logic that selects one of these returns from N (presumably
// a switch with one case per count) and any fallback for N outside 0..8 are not
// visible in this listing -- confirm against the full header.
57 __mmask8 createMask(camp::idx_t N)
const
63 return __mmask8(0x00);
65 return __mmask8(0x01);
67 return __mmask8(0x03);
69 return __mmask8(0x07);
71 return __mmask8(0x0F);
73 return __mmask8(0x1F);
75 return __mmask8(0x3F);
77 return __mmask8(0x7F);
79 return __mmask8(0xFF);
85 __m512i createStridedOffsets(camp::idx_t stride)
const
88 auto vstride = _mm512_set1_epi64(stride);
89 auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
90 return _mm512_mullo_epi64(vstride, vseq);
// Number of element_type lanes in one register: 8 x 64-bit int64_t fill a __m512i.
94 static constexpr camp::idx_t s_num_elem = 8;
101 Register() : base_type(), m_value(_mm512_setzero_epi32()) {}
107 explicit Register(register_type
const& c) : base_type(), m_value(c) {}
113 Register(self_type
const& c) : base_type(), m_value(c.m_value) {}
// Copy assignment from another register of the same type; returns a
// reference to a self_type.
// NOTE(review): only the signature is present in this listing -- the body is
// not visible, so its exact behavior is not documented here.
119 self_type& operator=(self_type
const& c)
// Broadcast constructor: initializes m_value with _mm512_set1_epi64(c), i.e.
// the scalar c replicated into every lane. It is not declared explicit, so a
// scalar element_type converts implicitly to a register.
// NOTE(review): the constructor body is not visible in this listing.
131 Register(element_type
const& c) : base_type(), m_value(_mm512_set1_epi64(c))
// Loads a full register from the memory starting at ptr using an unaligned load.
// Two code paths are chosen by the preprocessor: when compiling with GCC major
// versions 7 through 10, or with the Intel LLVM compiler while
// SYCL_LANGUAGE_VERSION is not defined, the zero-masking load
// _mm512_maskz_loadu_epi64 is used and its result assigned to m_value; the
// other visible call is the plain _mm512_loadu_epi64(ptr).
// NOTE(review): the mask argument of the masked load, the #else/#endif lines,
// the assignment for the plain load and the return statement are not visible
// in this listing -- presumably the mask enables all lanes and *this is
// returned; confirm against the full header.
139 self_type& load_packed(element_type
const* ptr)
142 #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
143 (!defined(SYCL_LANGUAGE_VERSION) && \
144 defined(__INTEL_LLVM_COMPILER))
145 m_value = _mm512_maskz_loadu_epi64(
150 _mm512_loadu_epi64(ptr);
// Partial packed load: reads from ptr only the lanes selected by
// createMask(N). Lanes that are not selected take their value from the
// all-zero vector passed as the first argument of _mm512_mask_loadu_epi64.
// NOTE(review): the left-hand side of the assignment and the return statement
// are not visible in this listing -- presumably m_value and *this.
162 self_type& load_packed_n(element_type
const* ptr, camp::idx_t N)
166 _mm512_mask_loadu_epi64(_mm512_setzero_epi32(), createMask(N), ptr);
// Strided load: gathers one element per lane from ptr into m_value. The lane
// offsets come from createStridedOffsets(stride) and are scaled by
// sizeof(element_type) bytes, so they are expressed in elements, not bytes.
// NOTE(review): the return statement is not visible in this listing --
// presumably returns *this.
175 self_type& load_strided(element_type
const* ptr, camp::idx_t stride)
178 m_value = _mm512_i64gather_epi64(createStridedOffsets(stride), ptr,
179 sizeof(element_type));
// Partial strided load: gathers from ptr only the lanes selected by
// createMask(N), using the offsets from createStridedOffsets(stride) scaled by
// sizeof(element_type) bytes. Lanes that are not selected take their value
// from the all-zero vector passed as the first argument.
// NOTE(review): the rest of the parameter list (the body uses `stride` and
// `N`) and the return statement are on lines missing from this listing --
// confirm against the full header.
189 self_type& load_strided_n(element_type
const* ptr,
194 m_value = _mm512_mask_i64gather_epi64(_mm512_setzero_epi32(), createMask(N),
195 createStridedOffsets(stride), ptr,
196 sizeof(element_type));
// Stores the register to the memory starting at ptr using an unaligned store.
// Two code paths are chosen by the preprocessor: when compiling with GCC major
// versions 7 through 10, or with the Intel LLVM compiler while
// SYCL_LANGUAGE_VERSION is not defined, the masked store
// _mm512_mask_storeu_epi64 is called with the mask ~0 (all mask bits set); the
// other visible call is the plain _mm512_storeu_epi64.
// NOTE(review): the value argument of both calls, the #else/#endif lines and
// the return statement are not visible in this listing -- presumably m_value
// is stored and *this is returned; confirm against the full header.
205 self_type
const& store_packed(element_type* ptr)
const
208 #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
209 (!defined(SYCL_LANGUAGE_VERSION) && \
210 defined(__INTEL_LLVM_COMPILER))
211 _mm512_mask_storeu_epi64(ptr, ~0,
215 _mm512_storeu_epi64(ptr,
// Partial packed store: writes to ptr only the lanes of m_value selected by
// createMask(N); the masked store leaves the other destination elements
// untouched.
// NOTE(review): the return statement is not visible in this listing --
// presumably returns *this.
227 self_type
const& store_packed_n(element_type* ptr, camp::idx_t N)
const
230 _mm512_mask_storeu_epi64(ptr, createMask(N), m_value);
// Strided store: scatters the lanes of m_value to ptr. The lane offsets come
// from createStridedOffsets(stride) and are scaled by sizeof(element_type)
// bytes, so they are expressed in elements, not bytes.
// NOTE(review): the return statement is not visible in this listing --
// presumably returns *this.
239 self_type
const& store_strided(element_type* ptr, camp::idx_t stride)
const
242 _mm512_i64scatter_epi64(ptr, createStridedOffsets(stride), m_value,
243 sizeof(element_type));
// Partial strided store: scatters to ptr only the lanes of m_value selected by
// createMask(N), using the offsets from createStridedOffsets(stride) scaled by
// sizeof(element_type) bytes.
// NOTE(review): the rest of the parameter list (the body uses `stride` and
// `N`), the const qualifier and the return statement are on lines missing from
// this listing -- confirm against the full header.
252 self_type
const& store_strided_n(element_type* ptr,
257 _mm512_mask_i64scatter_epi64(ptr, createMask(N),
258 createStridedOffsets(stride), m_value,
259 sizeof(element_type));
269 element_type
get(camp::idx_t i)
const {
return m_value[i]; }
// Setter taking a scalar value and a lane index i; returns a reference to a
// self_type.
// NOTE(review): only the signature is present in this listing -- the body is
// not visible (presumably it writes `value` into lane i and returns *this).
277 self_type& set(element_type value, camp::idx_t i)
// Overwrites m_value with the scalar `value` replicated into every lane
// (_mm512_set1_epi64).
// NOTE(review): the return statement is not visible in this listing --
// presumably returns *this.
286 self_type& broadcast(element_type
const& value)
288 m_value = _mm512_set1_epi64(value);
// Overwrites m_value with the vector value held by src.
// NOTE(review): the return statement is not visible in this listing --
// presumably returns *this.
295 self_type& copy(self_type
const& src)
297 m_value = src.m_value;
304 self_type add(self_type
const& b)
const
306 return self_type(_mm512_add_epi64(m_value, b.m_value));
312 self_type subtract(self_type
const& b)
const
314 return self_type(_mm512_sub_epi64(m_value, b.m_value));
320 self_type multiply(self_type
const& b)
const
322 return self_type(_mm512_mullo_epi64(m_value, b.m_value));
328 self_type divide(self_type
const& b)
const
331 return self_type(_mm512_set_epi64(
get(7) / b.get(7),
get(6) / b.get(6),
332 get(5) / b.get(5),
get(4) / b.get(4),
333 get(3) / b.get(3),
get(2) / b.get(2),
334 get(1) / b.get(1),
get(0) / b.get(0)));
340 self_type divide_n(self_type
const& b, camp::idx_t N)
const
343 return self_type(_mm512_set_epi64(
344 N >= 8 ?
get(7) / b.get(7) : 0, N >= 7 ?
get(6) / b.get(6) : 0,
345 N >= 6 ?
get(5) / b.get(5) : 0, N >= 5 ?
get(4) / b.get(4) : 0,
346 N >= 4 ?
get(3) / b.get(3) : 0, N >= 3 ?
get(2) / b.get(2) : 0,
347 N >= 2 ?
get(1) / b.get(1) : 0, N >= 1 ?
get(0) / b.get(0) : 0));
355 element_type
sum()
const {
return _mm512_reduce_add_epi64(m_value); }
362 element_type
max()
const {
return _mm512_reduce_max_epi64(m_value); }
369 element_type max_n(camp::idx_t N)
const
371 return _mm512_mask_reduce_max_epi64(createMask(N), m_value);
379 self_type vmax(self_type a)
const
381 return self_type(_mm512_max_epi64(m_value, a.m_value));
389 element_type
min()
const {
return _mm512_reduce_min_epi64(m_value); }
396 element_type min_n(camp::idx_t N)
const
398 return _mm512_mask_reduce_min_epi64(createMask(N), m_value);
406 self_type vmin(self_type a)
const
408 return self_type(_mm512_min_epi64(m_value, a.m_value));
RAJA header file defining SIMD/SIMT register operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155