RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
VectorRegisterImpl.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_pattern_tensor_VectorRegisterImpl_HPP
21 #define RAJA_pattern_tensor_VectorRegisterImpl_HPP
22 
23 #include "RAJA/config.hpp"
24 
25 #include "RAJA/util/macros.hpp"
26 
27 #include "camp/camp.hpp"
30 #include "RAJA/util/BitMask.hpp"
31 
32 namespace RAJA
33 {
34 
35 namespace expt
36 {
37 
41 template<typename REGISTER_POLICY, typename T, camp::idx_t SIZE>
42 class TensorRegister<REGISTER_POLICY,
43  T,
45  camp::idx_seq<SIZE>>
47  RAJA::expt::TensorRegister<REGISTER_POLICY,
48  T,
49  RAJA::expt::VectorLayout,
50  camp::idx_seq<SIZE>>>
51 {
52 public:
53  using self_type = TensorRegister<REGISTER_POLICY,
54  T,
56  camp::idx_seq<SIZE>>;
58  RAJA::expt::TensorRegister<REGISTER_POLICY,
59  T,
61  camp::idx_seq<SIZE>>>;
62  using element_type = camp::decay<T>;
65 
66  static constexpr camp::idx_t s_num_elem = SIZE;
67 
69  typename register_type::int_vector_type::element_type;
70  using int_vector_type = TensorRegister<REGISTER_POLICY,
73  camp::idx_seq<SIZE>>;
74 
75 private:
76  static constexpr camp::idx_t s_register_num_elem = register_type::s_num_elem;
77 
78  static constexpr camp::idx_t s_num_full_registers =
79  s_num_elem / s_register_num_elem;
80 
81  static constexpr camp::idx_t s_num_partial_lanes =
82  s_num_elem % s_register_num_elem;
83 
84  static constexpr camp::idx_t s_num_registers = (s_num_partial_lanes > 0)
85  ? s_num_full_registers + 1
86  : s_num_full_registers;
87 
89 
90  static constexpr camp::idx_t s_shift_per_register = log_base2_t::value;
91 
92  static constexpr camp::idx_t s_mask_per_register =
93  (1 << log_base2_t::value) - 1;
94 
95  // Offset of last regiser in m_registers
96  static constexpr camp::idx_t s_final_register = s_num_partial_lanes == 0
97  ? s_num_full_registers - 1
98  : s_num_full_registers;
99 
100  template<typename IDX>
101  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_register(IDX i) -> IDX
102  {
103  return i >> IDX(s_shift_per_register);
104  }
105 
106  template<typename IDX>
107  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_lane(IDX i) -> IDX
108  {
109  return i & IDX(s_mask_per_register);
110  }
111 
112  using base_type::m_registers;
113 
114 public:
116 
117  RAJA_INLINE
118  constexpr TensorRegister() {}
119 
121 
122  RAJA_INLINE
123  TensorRegister(element_type c) { this->broadcast(c); }
124 
125  RAJA_INLINE
126 
129 
130  /*
131  * Overload for: assignment of ET to a RAJA::expt::TensorRegister
132  */
133  template<typename RHS,
134  typename std::enable_if<
135  std::is_base_of<
137  RHS>::value,
138  bool>::type = true>
139  RAJA_INLINE RAJA_HOST_DEVICE TensorRegister(RHS const& rhs)
140  {
141  // evaluate a single tile of the ET, storing in this
142  // RAJA::expt::TensorRegister
143  *this = rhs.eval(base_type::s_get_default_tile());
144  }
145 
146  template<typename... REGS>
147  explicit RAJA_HOST_DEVICE RAJA_INLINE TensorRegister(register_type reg0,
148  REGS const&... regs)
149  : base_type(reg0, regs...)
150  {}
151 
153 
154  RAJA_INLINE
155  static constexpr bool is_root() { return register_type::is_root(); }
156 
164  template<camp::idx_t STRIDE_ONE_DIM>
165  RAJA_HOST_DEVICE RAJA_INLINE static constexpr bool is_ref_packed()
166  {
167  return STRIDE_ONE_DIM == 0;
168  }
169 
174 
175  RAJA_INLINE
176  static constexpr camp::idx_t s_dim_elem(camp::idx_t dim)
177  {
178  return dim == 0 ? s_num_elem : 0;
179  }
180 
186 
187  RAJA_INLINE
189  {
190  this->broadcast(value);
191  return *this;
192  }
193 
195 
196  RAJA_INLINE
197  self_type& operator=(self_type const& c) { return this->copy(c); }
198 
203  template<typename T2, typename L, typename RP>
205  {
206  return y.left_vector_multiply(*this);
207  }
208 
209 
210  template<typename REF_TYPE>
211  struct RefBridge;
212 
213  template<typename REF_TYPE>
214  RAJA_HOST_DEVICE RAJA_INLINE self_type& load_ref(REF_TYPE const& ref)
215  {
216  RefBridge<REF_TYPE>::load_ref(*this, ref);
217  return *this;
218  }
219 
220  template<typename REF_TYPE>
221  RAJA_HOST_DEVICE RAJA_INLINE self_type const& store_ref(REF_TYPE& ref) const
222  {
223  RefBridge<REF_TYPE>::store_ref(*this, ref);
224  return *this;
225  }
226 
227  template<typename POINTER_TYPE,
228  typename INDEX_TYPE,
230  camp::idx_t STRIDE_ONE_DIM>
231  struct RefBridge<
232  RAJA::internal::expt::
233  TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>>
234  {
235 
236  using RefType = RAJA::internal::expt::
237  TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>;
238 
243 
244  RAJA_INLINE
245  static void load_ref(self_type& self, RefType const& ref)
246  {
247 
248  auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
249 
250  // check for packed data
251  if (STRIDE_ONE_DIM == 0)
252  {
253  // full vector?
254  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
255  {
256 #ifdef RAJA_ENABLE_VECTOR_STATS
257  RAJA::tensor_stats::num_vector_load_packed++;
258 #endif
259  self.load_packed(ptr);
260  }
261  // partial
262  else
263  {
264 #ifdef RAJA_ENABLE_VECTOR_STATS
265  RAJA::tensor_stats::num_vector_load_packed_n++;
266 #endif
267  self.load_packed_n(ptr, ref.m_tile.m_size[0]);
268  }
269  }
270  // strided data
271  else
272  {
273  // full vector?
274  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
275  {
276 #ifdef RAJA_ENABLE_VECTOR_STATS
277  RAJA::tensor_stats::num_vector_load_strided++;
278 #endif
279  self.load_strided(ptr, ref.m_stride[0]);
280  }
281  // partial
282  else
283  {
284 #ifdef RAJA_ENABLE_VECTOR_STATS
285  RAJA::tensor_stats::num_vector_load_strided_n++;
286 #endif
287  self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
288  }
289  }
290  }
291 
296 
297  RAJA_INLINE
298  static void store_ref(self_type const& self, RefType& ref)
299  {
300 
301  auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
302 
303  // check for packed data
304  if (STRIDE_ONE_DIM == 0)
305  {
306  // full vector?
307  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
308  {
309 #ifdef RAJA_ENABLE_VECTOR_STATS
310  RAJA::tensor_stats::num_vector_store_packed++;
311 #endif
312  self.store_packed(ptr);
313  }
314  // partial
315  else
316  {
317 #ifdef RAJA_ENABLE_VECTOR_STATS
318  RAJA::tensor_stats::num_vector_store_packed_n++;
319 #endif
320  self.store_packed_n(ptr, ref.m_tile.m_size[0]);
321  }
322  }
323  // strided data
324  else
325  {
326  // full vector?
327  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
328  {
329 #ifdef RAJA_ENABLE_VECTOR_STATS
330  RAJA::tensor_stats::num_vector_store_strided++;
331 #endif
332  self.store_strided(ptr, ref.m_stride[0]);
333  }
334  // partial
335  else
336  {
337 #ifdef RAJA_ENABLE_VECTOR_STATS
338  RAJA::tensor_stats::num_vector_store_strided_n++;
339 #endif
340  self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
341  }
342  }
343  }
344  };
345 
346  template<typename POINTER_TYPE,
347  typename INDEX_TYPE,
349  INDEX_TYPE STRIDE_VALUE,
350  INDEX_TYPE BEGIN_VALUE,
351  INDEX_TYPE SIZE_VALUE,
352  camp::idx_t STRIDE_ONE_DIM>
354  POINTER_TYPE,
355  INDEX_TYPE,
356  TENSOR_SIZE,
357  camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
358  camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
359  camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
360  STRIDE_ONE_DIM>>
361  {
362 
364  POINTER_TYPE,
365  INDEX_TYPE,
366  TENSOR_SIZE,
367  camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
368  camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
369  camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
370  STRIDE_ONE_DIM>;
371 
376 
377  RAJA_INLINE
378  static void load_ref(self_type& self, RefType const& ref)
379  {
380 
381  auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
382 
383  // check for packed data
384  if (STRIDE_ONE_DIM == 0)
385  {
386  // full vector?
387  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
388  {
389 #ifdef RAJA_ENABLE_VECTOR_STATS
390  RAJA::tensor_stats::num_vector_load_packed++;
391 #endif
392  self.load_packed(ptr);
393  }
394  // partial
395  else
396  {
397 #ifdef RAJA_ENABLE_VECTOR_STATS
398  RAJA::tensor_stats::num_vector_load_packed_n++;
399 #endif
400  self.load_packed_n(ptr, ref.m_tile.m_size[0]);
401  }
402  }
403  // strided data
404  else
405  {
406  // full vector?
407  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
408  {
409 #ifdef RAJA_ENABLE_VECTOR_STATS
410  RAJA::tensor_stats::num_vector_load_strided++;
411 #endif
412  self.load_strided(ptr, ref.m_stride[0]);
413  }
414  // partial
415  else
416  {
417 #ifdef RAJA_ENABLE_VECTOR_STATS
418  RAJA::tensor_stats::num_vector_load_strided_n++;
419 #endif
420  self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
421  }
422  }
423  }
424 
429 
430  RAJA_INLINE
431  static void store_ref(self_type const& self, RefType& ref)
432  {
433 
434  auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
435 
436  // check for packed data
437  if (STRIDE_ONE_DIM == 0)
438  {
439  // full vector?
440  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
441  {
442 #ifdef RAJA_ENABLE_VECTOR_STATS
443  RAJA::tensor_stats::num_vector_store_packed++;
444 #endif
445  self.store_packed(ptr);
446  }
447  // partial
448  else
449  {
450 #ifdef RAJA_ENABLE_VECTOR_STATS
451  RAJA::tensor_stats::num_vector_store_packed_n++;
452 #endif
453  self.store_packed_n(ptr, ref.m_tile.m_size[0]);
454  }
455  }
456  // strided data
457  else
458  {
459  // full vector?
460  if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
461  {
462 #ifdef RAJA_ENABLE_VECTOR_STATS
463  RAJA::tensor_stats::num_vector_store_strided++;
464 #endif
465  self.store_strided(ptr, ref.m_stride[0]);
466  }
467  // partial
468  else
469  {
470 #ifdef RAJA_ENABLE_VECTOR_STATS
471  RAJA::tensor_stats::num_vector_store_strided_n++;
472 #endif
473  self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
474  }
475  }
476  }
477  };
478 
483 
484  RAJA_INLINE
486  {
487  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
488  {
489  m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
490  }
491  if (s_num_partial_lanes)
492  {
493  m_registers[s_final_register].load_packed_n(
494  ptr + s_final_register * s_register_num_elem, s_num_partial_lanes);
495  }
496  return *this;
497  }
498 
503 
504  RAJA_INLINE
505  self_type& load_strided(element_type const* ptr, int stride)
506  {
507  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
508  {
509  m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
510  stride);
511  }
512  if (s_num_partial_lanes)
513  {
514  m_registers[s_final_register].load_strided_n(
515  ptr + s_final_register * s_register_num_elem * stride, stride,
516  s_num_partial_lanes);
517  }
518  return *this;
519  }
520 
525 
526  RAJA_INLINE
528  {
529  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
530  {
531  if (N >= reg * s_register_num_elem + s_register_num_elem)
532  {
533  m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
534  }
535  else
536  {
537  m_registers[reg].load_packed_n(ptr + reg * s_register_num_elem,
538  N - reg * s_register_num_elem);
539 
540  for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
541  {
542  m_registers[r].broadcast(0);
543  }
544  return *this;
545  }
546  }
547  if (s_num_partial_lanes)
548  {
549  m_registers[s_final_register].load_packed_n(
550  ptr + s_final_register * s_register_num_elem,
551  N - s_final_register * s_register_num_elem);
552  }
553  return *this;
554  }
555 
560 
561  RAJA_INLINE
562  self_type& load_strided_n(element_type const* ptr, int stride, int N)
563  {
564  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
565  {
566  if (N >= reg * s_register_num_elem + s_register_num_elem)
567  {
568  m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
569  stride);
570  }
571  else
572  {
573  m_registers[reg].load_strided_n(ptr +
574  reg * s_register_num_elem * stride,
575  stride, N - reg * s_register_num_elem);
576  for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
577  {
578  m_registers[r].broadcast(0);
579  }
580  return *this;
581  }
582  }
583  if (s_num_partial_lanes)
584  {
585  m_registers[s_final_register].load_strided_n(
586  ptr + s_final_register * s_register_num_elem * stride, stride,
587  N - s_final_register * s_register_num_elem);
588  }
589  return *this;
590  }
591 
601  RAJA_INLINE
602 
605  {
606  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
607  {
608  m_registers[reg].gather(ptr, offsets.vec(reg));
609  }
610  if (s_num_partial_lanes)
611  {
612  m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register),
613  s_num_partial_lanes);
614  }
615  return *this;
616  }
617 
627  RAJA_INLINE
629  int_vector_type offsets,
630  camp::idx_t N)
631  {
632  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
633  {
634  if (N >= reg * s_register_num_elem + s_register_num_elem)
635  {
636  m_registers[reg].gather(ptr, offsets.vec(reg));
637  }
638  else
639  {
640  m_registers[reg].gather_n(ptr, offsets.vec(reg),
641  N - reg * s_register_num_elem);
642  for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
643  {
644  m_registers[r].broadcast(0);
645  }
646  return *this;
647  }
648  }
649  if (s_num_partial_lanes)
650  {
651  m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register),
652  N - s_final_register *
653  s_register_num_elem);
654  }
655  return *this;
656  }
657 
662 
663  RAJA_INLINE
665  {
666  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
667  {
668  m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
669  }
670  if (s_num_partial_lanes)
671  {
672  m_registers[s_final_register].store_packed_n(
673  ptr + s_final_register * s_register_num_elem, s_num_partial_lanes);
674  }
675  return *this;
676  }
677 
682 
683  RAJA_INLINE
684  self_type const& store_strided(element_type* ptr, int stride) const
685  {
686  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
687  {
688  m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
689  stride);
690  }
691  if (s_num_partial_lanes)
692  {
693  m_registers[s_final_register].store_strided_n(
694  ptr + s_final_register * s_register_num_elem * stride, stride,
695  s_num_partial_lanes);
696  }
697  return *this;
698  }
699 
704 
705  RAJA_INLINE
706  self_type const& store_packed_n(element_type* ptr, int N) const
707  {
708  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
709  {
710  if (N >= reg * s_register_num_elem + s_register_num_elem)
711  {
712  m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
713  }
714  else
715  {
716  m_registers[reg].store_packed_n(ptr + reg * s_register_num_elem,
717  N - reg * s_register_num_elem);
718  return *this;
719  }
720  }
721  if (s_num_partial_lanes)
722  {
723  m_registers[s_final_register].store_packed_n(
724  ptr + s_final_register * s_register_num_elem,
725  N - s_final_register * s_register_num_elem);
726  }
727  return *this;
728  }
729 
734 
735  RAJA_INLINE
736  self_type const& store_strided_n(element_type* ptr, int stride, int N) const
737  {
738  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
739  {
740  if (N >= reg * s_register_num_elem + s_register_num_elem)
741  {
742  m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
743  stride);
744  }
745  else
746  {
747  m_registers[reg].store_strided_n(ptr +
748  reg * s_register_num_elem * stride,
749  stride, N - reg * s_register_num_elem);
750  return *this;
751  }
752  }
753  if (s_num_partial_lanes)
754  {
755  m_registers[s_final_register].store_strided_n(
756  ptr + s_final_register * s_register_num_elem * stride, stride,
757  N - s_final_register * s_register_num_elem);
758  }
759  return *this;
760  }
761 
772 
773  RAJA_INLINE
775  int_vector_type const& offsets) const
776  {
777  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
778  {
779  m_registers[reg].scatter(ptr, offsets.vec(reg));
780  }
781  if (s_num_partial_lanes)
782  {
783  m_registers[s_final_register].scatter_n(
784  ptr, offsets.vec(s_final_register), s_num_partial_lanes);
785  }
786  return *this;
787  }
788 
799 
800  RAJA_INLINE
802  int_vector_type const& offsets,
803  camp::idx_t N) const
804  {
805  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
806  {
807  if (N >= reg * s_register_num_elem + s_register_num_elem)
808  {
809  m_registers[reg].scatter(ptr, offsets.vec(reg));
810  }
811  else
812  {
813  m_registers[reg].scatter_n(ptr, offsets.vec(reg),
814  N - reg * s_register_num_elem);
815 
816  return *this;
817  }
818  }
819  if (s_num_partial_lanes)
820  {
821  m_registers[s_final_register].scatter_n(
822  ptr, offsets.vec(s_final_register),
823  N - s_num_full_registers * s_register_num_elem);
824  }
825  return *this;
826  }
827 
829 
830  RAJA_INLINE
831  self_type divide(self_type const& den) const
832  {
833  self_type result;
834  for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
835  {
836  result.vec(reg) = m_registers[reg].divide(den.vec(reg));
837  }
838  if (s_num_partial_lanes)
839  {
840  result.vec(s_final_register) = m_registers[s_final_register].divide_n(
841  den.vec(s_final_register), s_num_partial_lanes);
842  }
843  return result;
844  }
845 
854 
855  RAJA_INLINE
856  self_type divide_n(self_type const& b, camp::idx_t n) const
857  {
858  self_type q(*this);
859  for (camp::idx_t i = 0; i < n; ++i)
860  {
861  q.set(this->get(i) / b.get(i), i);
862  }
863  return q;
864  }
865 
874 
875  RAJA_INLINE
876  self_type divide_n(element_type const& b, camp::idx_t n) const
877  {
878  self_type q(*this);
879  for (camp::idx_t i = 0; i < n; ++i)
880  {
881  q.set(this->get(i) / b, i);
882  }
883  return q;
884  }
885 
890  RAJA_INLINE
891 
894  {
895  // special case where there's just one parital register
896  if (s_num_full_registers == 0)
897  {
898  return m_registers[0].min_n(s_num_partial_lanes);
899  }
900 
901  element_type result = m_registers[0].min();
902  for (camp::idx_t i = 1; i < s_num_full_registers; ++i)
903  {
904  result = RAJA::min<element_type>(result, m_registers[i].min());
905  }
906  if (s_num_partial_lanes)
907  {
908  result = RAJA::min<element_type>(
909  result, m_registers[s_final_register].min_n(s_num_partial_lanes));
910  }
911  return result;
912  }
913 
917  RAJA_INLINE
918 
920  element_type min_n(int N) const
921  {
922  // special case where there's just one parital register
923  if (N < s_register_num_elem)
924  {
925  return m_registers[0].min_n(N);
926  }
927 
928  element_type result = m_registers[0].min();
929  for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg)
930  {
931  if (N >= reg * s_register_num_elem + s_register_num_elem)
932  {
933  result = RAJA::min<element_type>(result, m_registers[reg].min());
934  }
935  else
936  {
937  return RAJA::min<element_type>(
938  result, m_registers[reg].min_n(N - reg * s_register_num_elem));
939  }
940  }
941  if (N - s_num_full_registers * s_register_num_elem > 0)
942  {
943  result = RAJA::min<element_type>(
944  result, m_registers[s_final_register].min_n(
945  N - s_final_register * s_register_num_elem));
946  }
947  return result;
948  }
949 
954  RAJA_INLINE
955 
958  {
959  // special case where there's just one parital register
960  if (s_num_full_registers == 0)
961  {
962  return m_registers[0].max_n(s_num_partial_lanes);
963  }
964 
965  element_type result = m_registers[0].max();
966  for (camp::idx_t i = 1; i < s_num_full_registers; ++i)
967  {
968  result = RAJA::max<element_type>(result, m_registers[i].max());
969  }
970  if (s_num_partial_lanes)
971  {
972  result = RAJA::max<element_type>(
973  result, m_registers[s_final_register].max_n(s_num_partial_lanes));
974  }
975  return result;
976  }
977 
981  RAJA_INLINE
982 
984  element_type max_n(int N) const
985  {
986  // special case where there's just one parital register
987  if (N < s_register_num_elem)
988  {
989  return m_registers[0].max_n(N);
990  }
991 
992  element_type result = m_registers[0].max();
993  for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg)
994  {
995  if (N >= reg * s_register_num_elem + s_register_num_elem)
996  {
997  result = RAJA::max<element_type>(result, m_registers[reg].max());
998  }
999  else
1000  {
1001  return RAJA::max<element_type>(
1002  result, m_registers[reg].max_n(N - reg * s_register_num_elem));
1003  }
1004  }
1005  if (N - s_num_full_registers * s_register_num_elem > 0)
1006  {
1007  result = RAJA::max<element_type>(
1008  result, m_registers[s_final_register].max_n(
1009  N - s_final_register * s_register_num_elem));
1010  }
1011  return result;
1012  }
1013 
1017  RAJA_INLINE
1018 
1021  {
1022  // first do a vector sum of all registers
1023  register_type s = m_registers[0];
1024  for (camp::idx_t i = 1; i < s_num_registers; ++i)
1025  {
1026  s += m_registers[i];
1027  }
1028  // then a horizontal sum of result
1029  return s.sum();
1030  }
1031 
1036 
1037  RAJA_INLINE
1038  self_type operator*(self_type const& x) const { return this->multiply(x); }
1039 
1044 
1045  RAJA_INLINE
1046  element_type dot(self_type const& x) const
1047  {
1048  element_type dp(0);
1049  for (camp::idx_t i = 0; i < s_num_registers; ++i)
1050  {
1051  dp += m_registers[i].dot(x.vec(i));
1052  }
1053  return dp;
1054  }
1055 
1057 
1058  RAJA_INLINE
1059  self_type& set(element_type val, int idx)
1060  {
1061  m_registers[to_register(idx)].set(val, to_lane(idx));
1062  return *this;
1063  }
1064 
1066 
1067  RAJA_INLINE
1068  element_type get(int idx) const
1069  {
1070  return m_registers[to_register(idx)].get(to_lane(idx));
1071  }
1072 
1078  RAJA_INLINE
1079  std::string to_string() const
1080  {
1081  std::string s = "Vector(" + std::to_string(s_num_elem) + ")[ ";
1082 
1083  //
1084  for (camp::idx_t i = 0; i < s_num_elem; ++i)
1085  {
1086  s += std::to_string(this->get(i)) + " ";
1087  }
1088 
1089  camp::idx_t physical_size = s_num_registers * s_register_num_elem;
1090  if (s_num_elem < physical_size)
1091  {
1092  s += "{";
1093  for (camp::idx_t i = s_num_elem; i < physical_size; ++i)
1094  {
1095  s += std::to_string(this->get(i)) + " ";
1096  }
1097  s += "}";
1098  }
1099 
1100 
1101  s += " ]\n";
1102 
1103  return s;
1104  }
1105 };
1106 
1107 
1108 } // namespace expt
1109 } // namespace RAJA
1110 
1111 // Bring in the register policy file so we get the default register type
1112 // and all of the register traits setup
1113 #include "RAJA/policy/tensor/arch.hpp"
1114 
1115 
1116 #endif
RAJA header file defining a bit masking operator.
RAJA header file defining SIMD/SIMT register operations.
Header file containing RAJA simd policy definitions.
Definition: RegisterBase.hpp:39
RAJA_INLINE RAJA_HOST_DEVICE element_type max_n(int N) const
Returns the largest element over the first N lanes.
Definition: VectorRegisterImpl.hpp:984
RAJA_INLINE RAJA_HOST_DEVICE TensorRegister(self_type const &c)
Definition: VectorRegisterImpl.hpp:128
RAJA_INLINE RAJA_HOST_DEVICE element_type max() const
Returns the largest element.
Definition: VectorRegisterImpl.hpp:957
RAJA_HOST_DEVICE RAJA_INLINE self_type & load_strided_n(element_type const *ptr, int stride, int N)
Definition: VectorRegisterImpl.hpp:562
RAJA_HOST_DEVICE RAJA_INLINE self_type const & store_strided(element_type *ptr, int stride) const
Definition: VectorRegisterImpl.hpp:684
self_type operator*(SquareMatrixRegister< T2, L, RP > const &y) const
Definition: VectorRegisterImpl.hpp:204
RAJA_INLINE RAJA_HOST_DEVICE TensorRegister(RHS const &rhs)
Definition: VectorRegisterImpl.hpp:139
RAJA_HOST_DEVICE RAJA_INLINE self_type const & store_strided_n(element_type *ptr, int stride, int N) const
Definition: VectorRegisterImpl.hpp:736
RAJA_INLINE RAJA_HOST_DEVICE self_type & gather(element_type const *ptr, int_vector_type offsets)
Generic gather operation for full vector.
Definition: VectorRegisterImpl.hpp:604
RAJA_HOST_DEVICE RAJA_INLINE element_type get(int idx) const
Definition: VectorRegisterImpl.hpp:1068
RAJA_HOST_DEVICE RAJA_INLINE TensorRegister(element_type c)
Definition: VectorRegisterImpl.hpp:123
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE self_type divide_n(self_type const &b, camp::idx_t n) const
Divide n elements of this vector by another vector.
Definition: VectorRegisterImpl.hpp:856
RAJA_HOST_DEVICE RAJA_INLINE element_type dot(self_type const &x) const
The dot product of two vectors.
Definition: VectorRegisterImpl.hpp:1046
RAJA_INLINE std::string to_string() const
Converts the vector to a string.
Definition: VectorRegisterImpl.hpp:1079
RAJA_HOST_DEVICE RAJA_INLINE self_type & load_packed(element_type const *ptr)
Definition: VectorRegisterImpl.hpp:485
RAJA_HOST_DEVICE RAJA_INLINE self_type const & store_packed_n(element_type *ptr, int N) const
Definition: VectorRegisterImpl.hpp:706
RAJA_HOST_DEVICE RAJA_INLINE self_type const & scatter_n(element_type *ptr, int_vector_type const &offsets, camp::idx_t N) const
Generic scatter operation for n-length subvector.
Definition: VectorRegisterImpl.hpp:801
RAJA_INLINE RAJA_HOST_DEVICE element_type min() const
Returns the smallest element.
Definition: VectorRegisterImpl.hpp:893
typename register_type::int_vector_type::element_type int_element_type
Definition: VectorRegisterImpl.hpp:69
RAJA_HOST_DEVICE RAJA_INLINE TensorRegister(register_type reg0, REGS const &... regs)
Definition: VectorRegisterImpl.hpp:147
RAJA_HOST_DEVICE static constexpr RAJA_INLINE bool is_ref_packed()
Definition: VectorRegisterImpl.hpp:165
RAJA_HOST_DEVICE static constexpr RAJA_INLINE bool is_root()
Definition: VectorRegisterImpl.hpp:155
RAJA_INLINE RAJA_HOST_DEVICE element_type min_n(int N) const
Returns the smallest element over the first N lanes.
Definition: VectorRegisterImpl.hpp:920
RAJA_HOST_DEVICE RAJA_INLINE self_type & set(element_type val, int idx)
Definition: VectorRegisterImpl.hpp:1059
RAJA_HOST_DEVICE constexpr RAJA_INLINE TensorRegister()
Definition: VectorRegisterImpl.hpp:118
RAJA_HOST_DEVICE RAJA_INLINE self_type & operator=(self_type const &c)
Definition: VectorRegisterImpl.hpp:197
RAJA_HOST_DEVICE RAJA_INLINE self_type const & scatter(element_type *ptr, int_vector_type const &offsets) const
Generic scatter operation for full vector.
Definition: VectorRegisterImpl.hpp:774
RAJA_HOST_DEVICE RAJA_INLINE self_type divide(self_type const &den) const
Definition: VectorRegisterImpl.hpp:831
RAJA_HOST_DEVICE RAJA_INLINE self_type const & store_packed(element_type *ptr) const
Definition: VectorRegisterImpl.hpp:664
RAJA_HOST_DEVICE RAJA_INLINE self_type const & store_ref(REF_TYPE &ref) const
Definition: VectorRegisterImpl.hpp:221
RAJA_INLINE self_type & gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N)
Generic gather operation for n-length subvector.
Definition: VectorRegisterImpl.hpp:628
RAJA_HOST_DEVICE static constexpr RAJA_INLINE camp::idx_t s_dim_elem(camp::idx_t dim)
Definition: VectorRegisterImpl.hpp:176
RAJA_INLINE RAJA_HOST_DEVICE element_type sum() const
Returns the sum of all elements.
Definition: VectorRegisterImpl.hpp:1020
RAJA_HOST_DEVICE RAJA_INLINE self_type & load_strided(element_type const *ptr, int stride)
Definition: VectorRegisterImpl.hpp:505
RAJA_HOST_DEVICE RAJA_INLINE self_type & operator=(element_type value)
Set entire vector to a single scalar value.
Definition: VectorRegisterImpl.hpp:188
RAJA_HOST_DEVICE RAJA_INLINE self_type & load_packed_n(element_type const *ptr, int N)
Definition: VectorRegisterImpl.hpp:527
RAJA_HOST_DEVICE RAJA_INLINE self_type operator*(self_type const &x) const
The * operator of two vectors is a element-wise multiply.
Definition: VectorRegisterImpl.hpp:1038
RAJA_HOST_DEVICE RAJA_INLINE self_type & load_ref(REF_TYPE const &ref)
Definition: VectorRegisterImpl.hpp:214
RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE self_type divide_n(element_type const &b, camp::idx_t n) const
Divide n elements of this vector by a scalar.
Definition: VectorRegisterImpl.hpp:876
Definition: TensorRegister.hpp:46
Definition: ExpressionTemplateBase.hpp:68
Definition: TensorRegisterBase.hpp:105
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
#define RAJA_SUPPRESS_HD_WARN
Definition: macros.hpp:68
TensorLayout< 0 > VectorLayout
Definition: TensorLayout.hpp:77
TensorTileSize
Definition: TensorRef.hpp:234
@ TENSOR_FULL
Definition: TensorRef.hpp:236
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result min(Args... args)
Definition: foldl.hpp:161
RAJA_HOST_DEVICE constexpr RAJA_INLINE RAJA::zip_tuple_element_t< I, zip_tuple< is_val, Ts... > > & get(zip_tuple< is_val, Ts... > &z) noexcept
Definition: zip_tuple.hpp:56
RAJA_HOST_DEVICE constexpr RAJA_INLINE Result max(Args... args)
Definition: foldl.hpp:155
RAJA header file defining SIMD/SIMT register operations.
Definition: BitMask.hpp:30
Definition: TensorLayout.hpp:35
RAJA_HOST_DEVICE static RAJA_INLINE void load_ref(self_type &self, RefType const &ref)
Performs load specified by TensorRef object.
Definition: VectorRegisterImpl.hpp:245
RAJA_HOST_DEVICE static RAJA_INLINE void store_ref(self_type const &self, RefType &ref)
Performs store specified by TensorRef object.
Definition: VectorRegisterImpl.hpp:298
Definition: TensorRef.hpp:472
Definition: TensorRef.hpp:426
index_type m_stride[NUM_DIMS]
Definition: TensorRef.hpp:442
pointer_type m_pointer
Definition: TensorRef.hpp:441
tile_type m_tile
Definition: TensorRef.hpp:443
index_type m_begin[NUM_DIMS]
Definition: TensorRef.hpp:246
index_type m_size[NUM_DIMS]
Definition: TensorRef.hpp:247