RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
avx_int32.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifdef __AVX__
21 
22 #ifndef RAJA_policy_vector_register_avx_int32_HPP
23 #define RAJA_policy_vector_register_avx_int32_HPP
24 
#include "RAJA/config.hpp"
#include "RAJA/util/macros.hpp"

// Include SIMD intrinsics header file
#include <immintrin.h>
#include <cmath>
#include <cstdint>
#include <limits>
32 
33 namespace RAJA
34 {
35 namespace expt
36 {
37 template<>
38 class Register<int32_t, avx_register>
39  : public internal::expt::RegisterBase<Register<int32_t, avx_register>>
40 {
41 public:
42  using base_type =
43  internal::expt::RegisterBase<Register<int32_t, avx_register>>;
44 
45  using register_policy = avx_register;
46  using self_type = Register<int32_t, avx_register>;
47  using element_type = int32_t;
48  using register_type = __m256i;
49 
50  using int_vector_type = Register<int32_t, avx_register>;
51 
52 
53 private:
54  register_type m_value;
55 
56  RAJA_INLINE
57  __m256i createMask(camp::idx_t N) const
58  {
59  // Generate a mask
60  return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
61  N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
62  N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
63  }
64 
65  RAJA_INLINE
66  __m256i createStridedOffsets(camp::idx_t stride) const
67  {
68  // Generate a strided offset list
69  return _mm256_set_epi32(7 * stride, 6 * stride, 5 * stride, 4 * stride,
70  3 * stride, 2 * stride, stride, 0);
71  }
72 
73  RAJA_INLINE
74  __m256i createPermute1(camp::idx_t N) const
75  {
76  // Generate a permutation for first round of min/max routines
77  return _mm256_set_epi32(N >= 7 ? 6 : 0, N >= 8 ? 7 : 0, N >= 5 ? 4 : 0,
78  N >= 6 ? 5 : 0, N >= 3 ? 2 : 0, N >= 4 ? 3 : 0,
79  N >= 1 ? 0 : 0, N >= 2 ? 1 : 0);
80  }
81 
82  RAJA_INLINE
83  __m256i createPermute2(camp::idx_t N) const
84  {
85  // Generate a permutation for second round of min/max routines
86  return _mm256_set_epi32(N >= 6 ? 5 : 0, N >= 5 ? 4 : 0, N >= 8 ? 7 : 0,
87  N >= 7 ? 6 : 0, N >= 2 ? 1 : 0, N >= 1 ? 0 : 0,
88  N >= 4 ? 3 : 0, N >= 2 ? 2 : 0);
89  }
90 
91 public:
92  static constexpr camp::idx_t s_num_elem = 8;
93 
97  RAJA_INLINE
98  Register() : base_type(), m_value(_mm256_setzero_si256()) {}
99 
103  RAJA_INLINE
104  explicit Register(register_type const& c) : base_type(), m_value(c) {}
105 
109  RAJA_INLINE
110  Register(element_type x0,
111  element_type x1,
112  element_type x2,
113  element_type x3,
114  element_type x4,
115  element_type x5,
116  element_type x6,
117  element_type x7)
118  : m_value(_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0))
119  {}
120 
124  RAJA_INLINE
125  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
126 
130  RAJA_INLINE
131  self_type& operator=(self_type const& c)
132  {
133  m_value = c.m_value;
134  return *this;
135  }
136 
141  RAJA_INLINE
142  Register(element_type const& c) : m_value(_mm256_set1_epi32(c)) {}
143 
148  RAJA_INLINE
149  self_type& load_packed(element_type const* ptr)
150  {
151  m_value = _mm256_loadu_si256((__m256i const*)ptr);
152  return *this;
153  }
154 
160  RAJA_INLINE
161  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
162  {
163  m_value = _mm256_setzero_si256();
164  for (camp::idx_t i = 0; i < N; ++i)
165  {
166  set(ptr[i], i);
167  }
168  return *this;
169  }
170 
175  RAJA_INLINE
176  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
177  {
178  for (camp::idx_t i = 0; i < 8; ++i)
179  {
180  set(ptr[i * stride], i);
181  }
182  return *this;
183  }
184 
190  RAJA_INLINE
191  self_type& load_strided_n(element_type const* ptr,
192  camp::idx_t stride,
193  camp::idx_t N)
194  {
195  m_value = _mm256_setzero_si256();
196  for (camp::idx_t i = 0; i < N; ++i)
197  {
198  set(ptr[i * stride], i);
199  }
200  return *this;
201  }
202 
207  RAJA_INLINE
208  self_type const& store_packed(element_type* ptr) const
209  {
210  _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
211  return *this;
212  }
213 
218  RAJA_INLINE
219  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
220  {
221  _mm256_maskstore_ps(reinterpret_cast<float*>(ptr), createMask(N),
222  reinterpret_cast<__m256>(m_value));
223  return *this;
224  }
225 
230  RAJA_INLINE
231  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
232  {
233  for (camp::idx_t i = 0; i < 8; ++i)
234  {
235  ptr[i * stride] = get(i);
236  }
237  return *this;
238  }
239 
244  RAJA_INLINE
245  self_type const& store_strided_n(element_type* ptr,
246  camp::idx_t stride,
247  camp::idx_t N) const
248  {
249  for (camp::idx_t i = 0; i < N; ++i)
250  {
251  ptr[i * stride] = get(i);
252  }
253  return *this;
254  }
255 
261  RAJA_INLINE
262  element_type get(camp::idx_t i) const
263  {
264  // got to be a nicer way to do this!?!?
265  switch (i)
266  {
267  case 0:
268  return _mm256_extract_epi32(m_value, 0);
269  case 1:
270  return _mm256_extract_epi32(m_value, 1);
271  case 2:
272  return _mm256_extract_epi32(m_value, 2);
273  case 3:
274  return _mm256_extract_epi32(m_value, 3);
275  case 4:
276  return _mm256_extract_epi32(m_value, 4);
277  case 5:
278  return _mm256_extract_epi32(m_value, 5);
279  case 6:
280  return _mm256_extract_epi32(m_value, 6);
281  case 7:
282  return _mm256_extract_epi32(m_value, 7);
283  }
284  return 0;
285  }
286 
292  RAJA_INLINE
293  self_type& set(element_type value, camp::idx_t i)
294  {
295  // got to be a nicer way to do this!?!?
296  switch (i)
297  {
298  case 0:
299  m_value = _mm256_insert_epi32(m_value, value, 0);
300  break;
301  case 1:
302  m_value = _mm256_insert_epi32(m_value, value, 1);
303  break;
304  case 2:
305  m_value = _mm256_insert_epi32(m_value, value, 2);
306  break;
307  case 3:
308  m_value = _mm256_insert_epi32(m_value, value, 3);
309  break;
310  case 4:
311  m_value = _mm256_insert_epi32(m_value, value, 4);
312  break;
313  case 5:
314  m_value = _mm256_insert_epi32(m_value, value, 5);
315  break;
316  case 6:
317  m_value = _mm256_insert_epi32(m_value, value, 6);
318  break;
319  case 7:
320  m_value = _mm256_insert_epi32(m_value, value, 7);
321  break;
322  }
323 
324  return *this;
325  }
326 
328 
329  RAJA_INLINE
330  self_type& broadcast(element_type const& value)
331  {
332  m_value = _mm256_set1_epi32(value);
333  return *this;
334  }
335 
337 
338  RAJA_INLINE
339  self_type& copy(self_type const& src)
340  {
341  m_value = src.m_value;
342  return *this;
343  }
344 
346 
347  RAJA_INLINE
348  self_type add(self_type const& b) const
349  {
350  // no 8-way 32-bit add, but there is a 4-way... split and conquer
351 
352  // Low 128-bits - use _mm256_castsi256_si128???
353  auto low_a = _mm256_castsi256_si128(m_value);
354  auto low_b = _mm256_castsi256_si128(b.m_value);
355  auto res_low = _mm256_castsi128_si256(_mm_add_epi32(low_a, low_b));
356 
357  // Hi 128-bits
358  auto hi_a = _mm256_extractf128_si256(m_value, 1);
359  auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
360  auto res_hi = _mm_add_epi32(hi_a, hi_b);
361 
362  // Stitch back together
363  return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
364  }
365 
367 
368  RAJA_INLINE
369  self_type subtract(self_type const& b) const
370  {
371  // no 8-way 32-bit subtract, but there is a 4-way... split and conquer
372 
373  // Low 128-bits
374  auto low_a = _mm256_castsi256_si128(m_value);
375  auto low_b = _mm256_castsi256_si128(b.m_value);
376  auto res_low = _mm256_castsi128_si256(_mm_sub_epi32(low_a, low_b));
377 
378  // Hi 128-bits
379  auto hi_a = _mm256_extractf128_si256(m_value, 1);
380  auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
381  auto res_hi = _mm_sub_epi32(hi_a, hi_b);
382 
383  // Stitch back together
384  return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
385  }
386 
388 
389  RAJA_INLINE
390  self_type multiply(self_type const& b) const
391  {
392  // no 8-way 32-bit multiply, but there is a 32x32 -> 64
393  // This gets ugly :)
394 
395  // Low 128-bits
396  auto low_a = _mm256_castsi256_si128(m_value);
397  auto low_b = _mm256_castsi256_si128(b.m_value);
398  // multiply even lanes 0, 2
399  auto res_low_even = _mm_mul_epi32(low_a, low_b);
400 
401  // multiply odd lanes 1, 3
402  auto low_a_sh = _mm_shuffle_epi32(low_a, 0xB1);
403  auto low_b_sh = _mm_shuffle_epi32(low_b, 0xB1);
404  auto res_low_odd = _mm_mul_epi32(low_a_sh, low_b_sh);
405 
406  // recombine to get all 4 lanes
407  // note: AVX doesn't have a int32 blend, so we use the float32 blend
408  res_low_odd = _mm_shuffle_epi32(res_low_odd, 0xB1);
409  auto res_low = _mm256_castsi128_si256(_mm_castps_si128(_mm_blend_ps(
410  _mm_castsi128_ps(res_low_odd), _mm_castsi128_ps(res_low_even), 0x05)));
411 
412 
413  // High 128-bits
414  auto hi_a = _mm256_extractf128_si256(m_value, 1);
415  auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
416  // multiply even lanes 0, 2
417  auto res_hi_even = _mm_mul_epi32(hi_a, hi_b);
418 
419  // multiply odd lanes 1, 3
420  auto hi_a_sh = _mm_shuffle_epi32(hi_a, 0xB1);
421  auto hi_b_sh = _mm_shuffle_epi32(hi_b, 0xB1);
422  auto res_hi_odd = _mm_mul_epi32(hi_a_sh, hi_b_sh);
423 
424  // recombine to get all 4 lanes
425  // note: AVX doesn't have a int32 blend, so we use the float32 blend
426  res_hi_odd = _mm_shuffle_epi32(res_hi_odd, 0xB1);
427  auto res_hi = _mm_castps_si128(_mm_blend_ps(
428  _mm_castsi128_ps(res_hi_odd), _mm_castsi128_ps(res_hi_even), 0x05));
429 
430  // Stitch back together
431  return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
432  }
433 
435 
436  RAJA_INLINE
437  self_type divide(self_type const& b) const
438  {
439  // AVX2 does not supply an integer divide, so do it manually
440  return self_type(_mm256_set_epi32(get(7) / b.get(7), get(6) / b.get(6),
441  get(5) / b.get(5), get(4) / b.get(4),
442  get(3) / b.get(3), get(2) / b.get(2),
443  get(1) / b.get(1), get(0) / b.get(0)));
444  }
445 
447 
448  RAJA_INLINE
449  self_type divide_n(self_type const& b, camp::idx_t N) const
450  {
451  // AVX2 does not supply an integer divide, so do it manually
452  return self_type(_mm256_set_epi32(
453  N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
454  N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
455  N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
456  N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
457  }
458 
463  RAJA_INLINE
464  element_type sum() const
465  {
466  // Low 128-bits
467  auto low = _mm256_castsi256_si128(m_value);
468 
469  auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
470  auto low_red1 = _mm_add_epi32(low, low_sh1);
471 
472  auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
473  auto low_red2 = _mm_add_epi32(low_red1, low_sh2);
474 
475 
476  // High 128-bits
477  auto hi = _mm256_extractf128_si256(m_value, 1);
478 
479  auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
480  auto hi_red1 = _mm_add_epi32(hi, hi_sh1);
481 
482  auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
483  auto hi_red2 = _mm_add_epi32(hi_red1, hi_sh2);
484 
485 
486  // Sum halves, extract total sum
487  auto hi_low = _mm_add_epi32(hi_red2, low_red2);
488  return _mm_extract_epi32(hi_low, 0);
489  }
490 
495  RAJA_INLINE
496  element_type max() const
497  {
498  // this is just painful, since we don't have a proper masked permute
499  // in AVX. Lots of special cases to make sure we compare just the
500  // right lanes
501 
502 
503  // Low 128-bits
504  auto low = _mm256_castsi256_si128(m_value);
505 
506  auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
507  auto low_red1 = _mm_max_epi32(low, low_sh1);
508 
509  auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
510 
511  // lane 0 of low_red2 now has reduction of 0,1,2,3
512  auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
513 
514 
515  // High 128-bits
516  auto hi = _mm256_extractf128_si256(m_value, 1);
517 
518 
519  auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
520  auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
521 
522  auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
523  auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
524 
525 
526  // Sum halves, extract final reduction
527  auto hi_low = _mm_max_epi32(hi_red2, low_red2);
528  return _mm_extract_epi32(hi_low, 0);
529  }
530 
535  RAJA_INLINE
536  element_type max_n(camp::idx_t N) const
537  {
538  // Some simple cases
539  if (N <= 0 || N > 8)
540  {
542  }
543 
544  // this is just painful, since we don't have a proper masked permute
545  // in AVX. Lots of special cases to make sure we compare just the
546  // right lanes
547  if (N == 1)
548  {
549  return _mm256_extract_epi32(m_value, 0);
550  }
551 
552  // Low 128-bits
553  auto low = _mm256_castsi256_si128(m_value);
554 
555  auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
556  auto low_red1 = _mm_max_epi32(low, low_sh1);
557 
558  if (N == 2)
559  {
560  return _mm_extract_epi32(low_red1, 0);
561  }
562 
563  if (N == 3)
564  {
565  // get lane 2 into lane 0
566  auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
567  auto low_red1a = _mm_max_epi32(low_red1, low_sh1a);
568  return _mm_extract_epi32(low_red1a, 0);
569  }
570 
571  auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
572 
573  // lane 0 of low_red2 now has reduction of 0,1,2,3
574  auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
575 
576  if (N == 4)
577  {
578  return _mm_extract_epi32(low_red2, 0);
579  }
580 
581  // High 128-bits
582  auto hi = _mm256_extractf128_si256(m_value, 1);
583 
584  if (N == 5)
585  {
586  auto red_5 = _mm_max_epi32(low_red2, hi);
587  return _mm_extract_epi32(red_5, 0);
588  }
589 
590  auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
591  auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
592 
593  if (N == 6)
594  {
595  auto red_6 = _mm_max_epi32(low_red2, hi_red1);
596  return _mm_extract_epi32(red_6, 0);
597  }
598  if (N == 7)
599  {
600  // get lane 6 (lane 2 of hi) into lane 0
601  auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
602  auto hi_red_6 = _mm_max_epi32(hi_sh7, hi_red1);
603  auto red_7 = _mm_max_epi32(low_red2, hi_red_6);
604  return _mm_extract_epi32(red_7, 0);
605  }
606 
607  auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
608  auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
609 
610 
611  // Sum halves, extract total sum
612  auto hi_low = _mm_max_epi32(hi_red2, low_red2);
613  return _mm_extract_epi32(hi_low, 0);
614  }
615 
620  RAJA_INLINE
621  self_type vmax(self_type b) const
622  {
623  // no 8-way 32-bit min, but there is a 4-way... split and conquer
624 
625  // Low 128-bits - use _mm256_castsi256_si128???
626  auto low_a = _mm256_castsi256_si128(m_value);
627  auto low_b = _mm256_castsi256_si128(b.m_value);
628  auto res_low = _mm256_castsi128_si256(_mm_max_epi32(low_a, low_b));
629 
630  // Hi 128-bits
631  auto hi_a = _mm256_extractf128_si256(m_value, 1);
632  auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
633  auto res_hi = _mm_max_epi32(hi_a, hi_b);
634 
635  // Stitch back together
636  return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
637  }
638 
643  RAJA_INLINE
644  element_type min() const
645  {
646  // this is just painful, since we don't have a proper masked permute
647  // in AVX. Lots of special cases to make sure we compare just the
648  // right lanes
649 
650  // Low 128-bits
651  auto low = _mm256_castsi256_si128(m_value);
652 
653  auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
654  auto low_red1 = _mm_min_epi32(low, low_sh1);
655 
656  auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
657 
658  // lane 0 of low_red2 now has reduction of 0,1,2,3
659  auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
660 
661 
662  // High 128-bits
663  auto hi = _mm256_extractf128_si256(m_value, 1);
664 
665  auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
666  auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
667 
668 
669  auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
670  auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
671 
672 
673  // Sum halves, extract total sum
674  auto hi_low = _mm_min_epi32(hi_red2, low_red2);
675  return _mm_extract_epi32(hi_low, 0);
676  }
677 
682  RAJA_INLINE
683  element_type min_n(camp::idx_t N) const
684  {
685  // Some simple cases
686  if (N <= 0 || N > 8)
687  {
689  }
690  // this is just painful, since we don't have a proper masked permute
691  // in AVX. Lots of special cases to make sure we compare just the
692  // right lanes
693  if (N == 1)
694  {
695  return _mm256_extract_epi32(m_value, 0);
696  }
697 
698  // Low 128-bits
699  auto low = _mm256_castsi256_si128(m_value);
700 
701  auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
702  auto low_red1 = _mm_min_epi32(low, low_sh1);
703 
704  if (N == 2)
705  {
706  return _mm_extract_epi32(low_red1, 0);
707  }
708 
709  if (N == 3)
710  {
711  // get lane 2 into lane 0
712  auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
713  auto low_red1a = _mm_min_epi32(low_red1, low_sh1a);
714  return _mm_extract_epi32(low_red1a, 0);
715  }
716 
717  auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
718 
719  // lane 0 of low_red2 now has reduction of 0,1,2,3
720  auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
721 
722  if (N == 4)
723  {
724  return _mm_extract_epi32(low_red2, 0);
725  }
726 
727  // High 128-bits
728  auto hi = _mm256_extractf128_si256(m_value, 1);
729 
730  if (N == 5)
731  {
732  auto red_5 = _mm_min_epi32(low_red2, hi);
733  return _mm_extract_epi32(red_5, 0);
734  }
735 
736  auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
737  auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
738 
739  if (N == 6)
740  {
741  auto red_6 = _mm_min_epi32(low_red2, hi_red1);
742  return _mm_extract_epi32(red_6, 0);
743  }
744  if (N == 7)
745  {
746  // get lane 6 (lane 2 of hi) into lane 0
747  auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
748  auto hi_red_6 = _mm_min_epi32(hi_sh7, hi_red1);
749  auto red_7 = _mm_min_epi32(low_red2, hi_red_6);
750  return _mm_extract_epi32(red_7, 0);
751  }
752 
753  auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
754  auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
755 
756 
757  // Sum halves, extract total sum
758  auto hi_low = _mm_min_epi32(hi_red2, low_red2);
759  return _mm_extract_epi32(hi_low, 0);
760  }
761 
766  RAJA_INLINE
767  self_type vmin(self_type b) const
768  {
769  // no 8-way 32-bit min, but there is a 4-way... split and conquer
770 
771  // Low 128-bits - use _mm256_castsi256_si128???
772  auto low_a = _mm256_castsi256_si128(m_value);
773  auto low_b = _mm256_castsi256_si128(b.m_value);
774  auto res_low = _mm256_castsi128_si256(_mm_min_epi32(low_a, low_b));
775 
776  // Hi 128-bits
777  auto hi_a = _mm256_extractf128_si256(m_value, 1);
778  auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
779  auto res_hi = _mm_min_epi32(hi_a, hi_b);
780 
781  // Stitch back together
782  return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
783  }
784 };
785 
786 
787 } // namespace expt
788 
789 } // namespace RAJA
790 
791 
792 #endif
793 
#endif  // __AVX__
RAJA header file defining SIMD/SIMT register operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result sum(Args... args)
Definition: foldl.hpp:143
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155