RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
avx2_double.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifdef __AVX2__
21 
22 #ifndef RAJA_policy_vector_register_avx2_double_HPP
23 #define RAJA_policy_vector_register_avx2_double_HPP
24 
#include "RAJA/config.hpp"
#include "RAJA/util/macros.hpp"

// Include SIMD intrinsics header file
#include <immintrin.h>
#include <cmath>
#include <limits>
32 
33 namespace RAJA
34 {
35 namespace expt
36 {
37 
38 template<>
39 class Register<double, avx2_register>
40  : public internal::expt::RegisterBase<Register<double, avx2_register>>
41 {
42 public:
43  using base_type =
44  internal::expt::RegisterBase<Register<double, avx2_register>>;
45 
46  using register_policy = avx2_register;
47  using self_type = Register<double, avx2_register>;
48  using element_type = double;
49  using register_type = __m256d;
50 
51  using int_vector_type = Register<int64_t, avx2_register>;
52 
53 private:
54  register_type m_value;
55 
56  RAJA_INLINE
57  __m256i createMask(camp::idx_t N) const
58  {
59  // Generate a mask
60  return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
61  N >= 1 ? -1 : 0);
62  }
63 
64  RAJA_INLINE
65  __m256i createStridedOffsets(camp::idx_t stride) const
66  {
67  // Generate a strided offset list
68  return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
69  }
70 
71 public:
72  static constexpr camp::idx_t s_num_elem = 4;
73 
77  RAJA_INLINE
78  Register() : m_value(_mm256_setzero_pd()) {}
79 
83  RAJA_INLINE
84  Register(element_type x0, element_type x1, element_type x2, element_type x3)
85  : m_value(_mm256_set_pd(x3, x2, x1, x0))
86  {}
87 
91  RAJA_INLINE
92  explicit Register(register_type const& c) : m_value(c) {}
93 
97  RAJA_INLINE
98  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
99 
103  RAJA_INLINE
104  self_type& operator=(self_type const& c)
105  {
106  m_value = c.m_value;
107  return *this;
108  }
109 
114  RAJA_INLINE
115  Register(element_type const& c) : m_value(_mm256_set1_pd(c)) {}
116 
120  RAJA_INLINE
121  constexpr register_type get_register() const { return m_value; }
122 
127  RAJA_INLINE
128  self_type& load_packed(element_type const* ptr)
129  {
130 #ifdef RAJA_ENABLE_VECTOR_STATS
131  RAJA::tensor_stats::num_vector_load_packed++;
132 #endif
133  m_value = _mm256_loadu_pd(ptr);
134  return *this;
135  }
136 
142  RAJA_INLINE
143  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
144  {
145 #ifdef RAJA_ENABLE_VECTOR_STATS
146  RAJA::tensor_stats::num_vector_load_packed_n++;
147 #endif
148  m_value = _mm256_maskload_pd(ptr, createMask(N));
149  return *this;
150  }
151 
156  RAJA_INLINE
157  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
158  {
159 #ifdef RAJA_ENABLE_VECTOR_STATS
160  RAJA::tensor_stats::num_vector_load_strided++;
161 #endif
162  m_value = _mm256_i64gather_pd(ptr, createStridedOffsets(stride),
163  sizeof(element_type));
164  return *this;
165  }
166 
172  RAJA_INLINE
173  self_type& load_strided_n(element_type const* ptr,
174  camp::idx_t stride,
175  camp::idx_t N)
176  {
177 #ifdef RAJA_ENABLE_VECTOR_STATS
178  RAJA::tensor_stats::num_vector_load_strided_n++;
179 #endif
180  m_value = _mm256_mask_i64gather_pd(
181  _mm256_setzero_pd(), ptr, createStridedOffsets(stride),
182  _mm256_castsi256_pd(createMask(N)), sizeof(element_type));
183  return *this;
184  }
185 
195  RAJA_INLINE
196  self_type& gather(element_type const* ptr, int_vector_type offsets)
197  {
198 #ifdef RAJA_ENABLE_VECTOR_STATS
199  RAJA::tensor_stats::num_vector_load_strided_n++;
200 #endif
201  m_value =
202  _mm256_i64gather_pd(ptr, offsets.get_register(), sizeof(element_type));
203  return *this;
204  }
205 
215  RAJA_INLINE
216  self_type& gather_n(element_type const* ptr,
217  int_vector_type offsets,
218  camp::idx_t N)
219  {
220 #ifdef RAJA_ENABLE_VECTOR_STATS
221  RAJA::tensor_stats::num_vector_load_strided_n++;
222 #endif
223  m_value = _mm256_mask_i64gather_pd(
224  _mm256_setzero_pd(), ptr, offsets.get_register(),
225  _mm256_castsi256_pd(createMask(N)), sizeof(element_type));
226  return *this;
227  }
228 
233  RAJA_INLINE
234  self_type const& store_packed(element_type* ptr) const
235  {
236 #ifdef RAJA_ENABLE_VECTOR_STATS
237  RAJA::tensor_stats::num_vector_store_packed++;
238 #endif
239  _mm256_storeu_pd(ptr, m_value);
240  return *this;
241  }
242 
247  RAJA_INLINE
248  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
249  {
250 #ifdef RAJA_ENABLE_VECTOR_STATS
251  RAJA::tensor_stats::num_vector_store_packed_n++;
252 #endif
253  _mm256_maskstore_pd(ptr, createMask(N), m_value);
254  return *this;
255  }
256 
261  RAJA_INLINE
262  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
263  {
264 #ifdef RAJA_ENABLE_VECTOR_STATS
265  RAJA::tensor_stats::num_vector_store_strided++;
266 #endif
267  for (camp::idx_t i = 0; i < 4; ++i)
268  {
269  ptr[i * stride] = m_value[i];
270  }
271  return *this;
272  }
273 
278  RAJA_INLINE
279  self_type const& store_strided_n(element_type* ptr,
280  camp::idx_t stride,
281  camp::idx_t N) const
282  {
283 #ifdef RAJA_ENABLE_VECTOR_STATS
284  RAJA::tensor_stats::num_vector_store_strided_n++;
285 #endif
286  for (camp::idx_t i = 0; i < N; ++i)
287  {
288  ptr[i * stride] = m_value[i];
289  }
290  return *this;
291  }
292 
298  RAJA_INLINE
299  element_type get(camp::idx_t i) const { return m_value[i]; }
300 
306  RAJA_INLINE
307  self_type& set(element_type value, camp::idx_t i)
308  {
309  m_value[i] = value;
310  return *this;
311  }
312 
314 
315  RAJA_INLINE
316  self_type& broadcast(element_type const& value)
317  {
318  m_value = _mm256_set1_pd(value);
319  return *this;
320  }
321 
326 
327  RAJA_INLINE
328  self_type get_and_broadcast(int i) const
329  {
330  switch (i)
331  {
332  case 0:
333  return self_type(_mm256_permute4x64_pd(m_value, 0x00));
334  case 1:
335  return self_type(_mm256_permute4x64_pd(m_value, 0x55));
336  case 2:
337  return self_type(_mm256_permute4x64_pd(m_value, 0xAA));
338  case 3:
339  return self_type(_mm256_permute4x64_pd(m_value, 0xFF));
340  }
341  return *this;
342  }
343 
345 
346  RAJA_INLINE
347  self_type& copy(self_type const& src)
348  {
349  m_value = src.m_value;
350  return *this;
351  }
352 
354 
355  RAJA_INLINE
356  self_type add(self_type const& b) const
357  {
358  return self_type(_mm256_add_pd(m_value, b.m_value));
359  }
360 
362 
363  RAJA_INLINE
364  self_type subtract(self_type const& b) const
365  {
366  return self_type(_mm256_sub_pd(m_value, b.m_value));
367  }
368 
370 
371  RAJA_INLINE
372  self_type multiply(self_type const& b) const
373  {
374  return self_type(_mm256_mul_pd(m_value, b.m_value));
375  }
376 
378 
379  RAJA_INLINE
380  self_type divide(self_type const& b) const
381  {
382  return self_type(_mm256_div_pd(m_value, b.m_value));
383  }
384 
386 
387  RAJA_INLINE
388  self_type divide_n(self_type const& b, camp::idx_t N) const
389  {
390  // AVX2 does not supply a masked divide, so do it manually
391  return self_type(_mm256_set_pd(
392  N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
393  N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
394  }
395 
396 // only use FMA's if the compiler has them turned on
397 #ifdef __FMA__
398  RAJA_INLINE
399 
401  self_type multiply_add(self_type const& b, self_type const& c) const
402  {
403  return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
404  }
405 
406  RAJA_INLINE
407 
409  self_type multiply_subtract(self_type const& b, self_type const& c) const
410  {
411  return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
412  }
413 #endif
414 
419  RAJA_INLINE
420  element_type sum(camp::idx_t = 4) const
421  {
422  auto sh1 = _mm256_permute_pd(m_value, 0x5);
423  auto red1 = _mm256_add_pd(m_value, sh1);
424  return red1[0] + red1[2];
425  }
426 
431  RAJA_INLINE
432  element_type max(camp::idx_t N = 4) const
433  {
434  if (N == 4)
435  {
436  // permute the first two and last two lanes of the register
437  // A = { v[1], v[0], v[3], v[2] }
438  register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
439 
440  // take the maximum value of each lane
441  // B = { max{v[0], v[1]},
442  // max{v[0], v[1]},
443  // max{v[2], v[3]},
444  // max{v[2], v[3]} }
445  register_type b = _mm256_max_pd(m_value, a);
446 
447  // now take the maximum of a lower and upper halves
448  return RAJA::max<element_type>(b[0], b[2]);
449  }
450  else if (N == 3)
451  {
452  // permute the first two and last two lanes of the register
453  // use the third element TWICE, so we effectively remove the 4th
454  // lane
455  // A = { v[1], v[0], v[2], v[2] }
456  register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
457 
458  // take the maximum value of each lane
459  // B = { max{v[0], v[1]},
460  // max{v[0], v[1]},
461  // max{v[2], v[2]}, <-- just v[2]
462  // max{v[2], v[3]} }
463  register_type b = _mm256_max_pd(m_value, a);
464 
465  // now take the maximum of a lower and upper lane
466  return RAJA::max<element_type>(b[0], b[2]);
467  }
468  else if (N == 2)
469  {
470  return RAJA::max<element_type>(m_value[0], m_value[1]);
471  }
472  else if (N == 1)
473  {
474  return m_value[0];
475  }
477  }
478 
483  RAJA_INLINE
484  self_type vmax(self_type a) const
485  {
486  return self_type(_mm256_max_pd(m_value, a.m_value));
487  }
488 
493  RAJA_INLINE
494  element_type min() const
495  {
496  // permute the first two and last two lanes of the register
497  // A = { v[1], v[0], v[3], v[2] }
498  register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
499 
500  // take the minimum value of each lane
501  // B = { min{v[0], v[1]},
502  // min{v[0], v[1]},
503  // min{v[2], v[3]},
504  // min{v[2], v[3]} }
505  register_type b = _mm256_min_pd(m_value, a);
506 
507  // now take the minimum of a lower and upper halves
508  return RAJA::min<element_type>(b[0], b[2]);
509  }
510 
515  RAJA_INLINE
516  element_type min_n(camp::idx_t N) const
517  {
518  if (N == 4)
519  {
520  // permute the first two and last two lanes of the register
521  // A = { v[1], v[0], v[3], v[2] }
522  register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
523 
524  // take the minimum value of each lane
525  // B = { min{v[0], v[1]},
526  // min{v[0], v[1]},
527  // min{v[2], v[3]},
528  // min{v[2], v[3]} }
529  register_type b = _mm256_min_pd(m_value, a);
530 
531  // now take the minimum of a lower and upper halves
532  return std::min<element_type>(b[0], b[2]);
533  }
534  else if (N == 3)
535  {
536  // permute the first two and last two lanes of the register
537  // use the third element TWICE, so we effectively remove the 4th
538  // lane
539  // A = { v[1], v[0], v[2], v[2] }
540  register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
541 
542  // take the minimum value of each lane
543  // B = { min{v[0], v[1]},
544  // min{v[0], v[1]},
545  // min{v[2], v[2]}, <-- just v[2]
546  // min{v[2], v[3]} }
547  register_type b = _mm256_min_pd(m_value, a);
548 
549  // now take the minimum of a lower and upper lane
550  return std::min<element_type>(b[0], b[2]);
551  }
552  else if (N == 2)
553  {
554  return std::min<element_type>(m_value[0], m_value[1]);
555  }
556  else if (N == 1)
557  {
558  return m_value[0];
559  }
561  }
562 
567  RAJA_INLINE
568  self_type vmin(self_type a) const
569  {
570  return self_type(_mm256_min_pd(m_value, a.m_value));
571  }
572 };
573 
574 
575 } // namespace expt
576 
577 } // namespace RAJA
578 
579 
580 #endif
581 
582 #endif //__AVX2__
RAJA header file defining SIMD/SIMT register operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155