RAJA
RAJA provides a collection of platform portability abstractions for C++ HPC applications.
TensorTileExec.hpp
Go to the documentation of this file.
1 
11 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
12 // Copyright (c) Lawrence Livermore National Security, LLC and other
13 // RAJA Project Developers. See top-level LICENSE and COPYRIGHT
14 // files for dates and other details. No copyright assignment is required
15 // to contribute to RAJA.
16 //
17 // SPDX-License-Identifier: (BSD-3-Clause)
18 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
19 
20 #ifndef RAJA_pattern_tensor_TensorTileExec_HPP
21 #define RAJA_pattern_tensor_TensorTileExec_HPP
22 
23 #include "RAJA/config.hpp"
24 
25 #include "RAJA/util/macros.hpp"
26 
29 
30 namespace RAJA
31 {
32 namespace internal
33 {
34 namespace expt
35 {
36 
37 
// NOTE(review): the declaration introduced by this template header (source
// line 39) is missing from this listing. The same three-parameter list is
// used further down to forward-declare StaticTensorTileExec, so this is
// presumably an earlier forward declaration of it -- confirm against the
// original header.
38 template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
40 
// NOTE(review): the declaration introduced by this template header (source
// line 42) is also missing. The specializations below specialize a
// two-parameter TensorTileExec<STORAGE, DIM_SEQ>, so this is presumably its
// primary template declaration -- confirm against the original header.
41 template<typename STORAGE, typename DIM_SEQ>
43 
// Recursive case of the tiling executor: handles the tiling along dimension
// DIM0 and delegates the remaining dimensions (DIM_REST...) to inner_t.
47 template<typename STORAGE, camp::idx_t DIM0, camp::idx_t... DIM_REST>
48 struct TensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>>
49 {
50 
// Executor for the dimensions that remain once DIM0 has been peeled off.
51  using inner_t = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
52 
// Run-time tiling along DIM0.
//
//  otile - describes the whole range to cover; only m_begin[DIM0] and
//          m_size[DIM0] are read at this level.
//  tile  - working tile, modified in place: m_begin[DIM0] is stepped across
//          the range and put back to otile's begin before returning.
//  body  - passed on (as an lvalue) to inner_t::exec for every tile position.
53  template<typename OTILE, typename TTYPE, typename BODY>
54  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const& otile,
55  TTYPE& tile,
56  BODY&& body)
57  {
58 
59  auto const orig_begin = otile.m_begin[DIM0];
60  auto const orig_size = otile.m_size[DIM0];
61 
62  // Do the full tile sizes
63  for (tile.m_begin[DIM0] = orig_begin;
64 
// keep going only while a whole step of STORAGE::s_dim_elem(DIM0) still
// fits before the end of the range (orig_begin + orig_size)
65  tile.m_begin[DIM0] + STORAGE::s_dim_elem(DIM0) <=
66  orig_begin + orig_size;
67 
68  tile.m_begin[DIM0] += STORAGE::s_dim_elem(DIM0))
69  {
70 
71  // Do the next inner tiling loop
72  inner_t::exec(otile, tile, body);
73  }
74 
75  // Postamble if needed
// After the loop m_begin[DIM0] is the first position where a full step no
// longer fits; anything left before the end of the range is the remainder.
76  if (tile.m_begin[DIM0] < orig_begin + orig_size)
77  {
78 
79  // convert tile to a partial tile
// NOTE(review): bound by reference; presumably the same tile object viewed
// as a partial tile, since its size is restored below -- confirm against
// make_tensor_tile_partial.
80  auto& part_tile = make_tensor_tile_partial(tile);
81 
82  // store original size
83  auto tmp_size = part_tile.m_size[DIM0];
84 
85  // set tile size to the remainder
86  part_tile.m_size[DIM0] = orig_begin + orig_size - tile.m_begin[DIM0];
87 
88  // Do the next inner tiling loop
89  inner_t::exec(otile, part_tile, body);
90 
91  // restore size
92  part_tile.m_size[DIM0] = tmp_size;
93  }
94 
95  // reset tile dimension
96  tile.m_begin[DIM0] = orig_begin;
97  }
98 
// Compile-time counterpart of exec(): begin/size values are read from the
// tile *types* (begin_type / size_type) rather than from the tile objects.
99  template<typename OTILE, typename TTYPE, typename BODY>
100  RAJA_HOST_DEVICE RAJA_INLINE static void static_exec(OTILE const& otile,
101  TTYPE const& tile,
102  BODY&& body)
103  {
104 
105 
106  auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
107  auto constexpr orig_size = OTILE::size_type::value_at(DIM0);
108 
109  auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
110 
111  auto constexpr step_size = STORAGE::s_dim_elem(DIM0);
112 
// Number of step_size-wide steps needed to cover
// [tile_begin, orig_begin + orig_size), rounded up so a trailing partial
// step is counted; 0 when tile_begin lies outside the original range.
113  auto constexpr iter_count =
114  (tile_begin >= orig_begin) && (tile_begin < (orig_begin + orig_size))
115  ? ((orig_begin + orig_size) - tile_begin + step_size - 1) /
116  step_size
117  : 0;
118 
119 
// IdxSeq is an index sequence generated from iter_count.
120  using IterCount =
121  camp::integral_constant<typename TTYPE::index_type, iter_count>;
122  using DimSeq = camp::idx_seq<DIM0, DIM_REST...>;
123  using IdxSeq = typename camp::detail::gen_seq<typename TTYPE::index_type,
124  IterCount>::type;
125 
// NOTE(review): source line 126 is missing from this listing. It is
// presumably the statement that consumes DimSeq/IdxSeq (likely a
// StaticTensorTileExec call); otile, tile and body are not referenced in the
// lines shown here -- confirm against the original header.
127  }
128 };
129 
133 template<typename STORAGE>
134 struct TensorTileExec<STORAGE, camp::idx_seq<>>
135 {
136 
137  template<typename OTILE, typename TTYPE, typename BODY>
138  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE&,
139  TTYPE const& tile,
140  BODY&& body)
141  {
142 
143  // execute body, passing in the current tile
144  body(tile);
145  }
146 
147  template<typename OTILE, typename TTYPE, typename BODY>
148  RAJA_HOST_DEVICE RAJA_INLINE static void static_exec(OTILE const&,
149  TTYPE const& tile,
150  BODY&& body)
151  {
152 
153  // execute body, passing in the current tile
154  body(tile);
155  }
156 };
157 
// Run-time tiling driver: builds a working tile from orig_tile and runs the
// per-dimension tiling loops over it.
//
//  orig_tile - range to cover; its m_begin entries seed the working tile
//  body      - callable handed down to TensorTileExec::exec
//  (unnamed) - two camp::idx_seq tag arguments that only carry the IDX_SEQ
//              and DIM_SEQ packs; DIM_SEQ is not used in the lines shown.
//
// NOTE(review): source line 163 (return type, function name and opening
// parenthesis) is missing from this listing; the cross-reference index at the
// end of this page lists tensorTileExec_expanded as the definition at that
// line.
158 template<typename STORAGE,
159  typename TILE_TYPE,
160  typename BODY,
161  camp::idx_t... IDX_SEQ,
162  camp::idx_t... DIM_SEQ>
164  TILE_TYPE const& orig_tile,
165  BODY&& body,
166  camp::idx_seq<IDX_SEQ...> const&,
167  camp::idx_seq<DIM_SEQ...> const&)
168 {
169 
170  // tile over full rows and columns
171  // tile_type tile{{0,0},{row_tile_size, col_tile_size}};
// begin: copied from orig_tile for each index in IDX_SEQ
// size:  STORAGE::s_dim_elem(i) for each index in IDX_SEQ
172  TILE_TYPE tile {
173  {orig_tile.m_begin[IDX_SEQ]...},
174  {STORAGE::s_dim_elem(IDX_SEQ)...},
175  };
176 
177 
178  // Promote the tile type to a "full-tile" so that the full-element
179  // register operations are used.
180  // Any of the tiling loops can demote this to a partial-tile when
181  // they do postamble execution
182  auto& full_tile = make_tensor_tile_full(tile);
183 
184  // Do all of the tiling loops in layout order, this may improve
185  // cache performance
186  using layout_order = typename STORAGE::layout_type::seq_t;
187  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
188 
189 
190  tensor_tile_exec_t::exec(orig_tile, full_tile, body);
191 }
192 
193 
// Forward declaration of the compile-time tiling executor. Only the partial
// specializations that follow are defined; each takes a camp::idx_seq of
// dimensions (DIM_SEQ) and a camp::idx_seq of indices (IDX_SEQ).
194 template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
195 struct StaticTensorTileExec;
196 
// Compile-time tiling step for dimension DIM0 when further dimensions
// (DIM_REST...) follow. Each instantiation handles one tile position along
// DIM0 and removes one entry (IDX) from the index sequence.
// NOTE(review): IDX itself is not referenced in the lines shown; the index
// sequence presumably only bounds the recursion depth -- confirm.
201 template<typename STORAGE,
202  camp::idx_t DIM0,
203  camp::idx_t... DIM_REST,
204  camp::idx_t IDX,
205  camp::idx_t... IDX_REST>
206 struct StaticTensorTileExec<STORAGE,
207  camp::idx_seq<DIM0, DIM_REST...>,
208  camp::idx_seq<IDX, IDX_REST...>>
209 {
210 
211  using DimList = camp::idx_seq<DIM0, DIM_REST...>;
212  using DimTail = camp::idx_seq<DIM_REST...>;
213  using IdxList = camp::idx_seq<IDX, IDX_REST...>;
214  using IdxTail = camp::idx_seq<IDX_REST...>;
215 
// Executor that descends into the remaining dimensions for the current tile.
216  using DownExec = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
// NOTE(review): source line 217 is missing from this listing. The two lines
// below are the tail of a template-id with the same dimensions and one fewer
// index; presumably the NextExec alias used in exec() -- confirm.
218  camp::idx_seq<DIM0, DIM_REST...>,
219  camp::idx_seq<IDX_REST...>>;
220 
// Not referenced in the lines shown of this struct; exec() calls
// STORAGE::s_dim_elem(DIM0) directly.
221  static auto const step_size = STORAGE::s_dim_elem(DIM0);
222 
// Handles the tile position encoded in TTYPE, then recurses.
//
//  otile - original range; its begin/size along DIM0 are read from the type
//  tile  - current tile; its begin along DIM0 is read from the type
//  body  - callable passed down to DownExec / NextExec
223  template<typename OTILE, typename TTYPE, typename BODY>
224  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const& otile,
225  TTYPE const& tile,
226  BODY&& body)
227  {
228 
229  auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
230  auto constexpr orig_size = OTILE::size_type::value_at(DIM0);
231 
232  auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
233 
// begin of the following tile: one storage step further along DIM0
234  using NextBegin =
235  camp::integral_constant<typename TTYPE::index_type,
236  tile_begin + STORAGE::s_dim_elem(DIM0)>;
// extent left between tile_begin and the end of the original range
237  using TailSize =
238  camp::integral_constant<typename TTYPE::index_type,
239  (orig_begin + orig_size) - tile_begin>;
240 
241  using NextTile =
242  typename expt::SetStaticTensorTileBegin<TTYPE, NextBegin,
243  (size_t)DIM0>::Type;
244 
245  using TailTile = typename expt::SetStaticTensorTileSize<TTYPE, TailSize,
246  (size_t)DIM0>::Type;
247  using PartTile = typename TailTile::Partial;
248 
249 
// Equivalent to tile_begin <= orig_begin + orig_size: rejects, at compile
// time, a tile that starts beyond the end of the original range.
250  static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
251  (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
252  "OOB StaticTensorTileExec DOWN");
253 
// Both conditions are compile-time constants but are tested with an ordinary
// `if`, so both branches are compiled for every instantiation.
254  if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size))
255  {
// a full step fits: process this tile, then move on to the next position
256  DownExec::static_exec(otile, tile, body);
257  NextTile next_tile;
258  NextExec::exec(otile, next_tile, body);
259  }
260  else if (tile_begin < (orig_begin + orig_size))
261  {
// only a remainder is left: process it as a partial tile and stop
262  PartTile part_tile;
263  DownExec::static_exec(otile, part_tile, body);
264  }
265  }
266 };
267 
// Compile-time tiling step for the last remaining dimension (a single DIM0):
// same stepping logic as the multi-dimension specialization, but the body is
// invoked directly instead of descending into further dimensions.
268 template<typename STORAGE,
269  camp::idx_t DIM0,
270  camp::idx_t IDX,
271  camp::idx_t... IDX_REST>
272 struct StaticTensorTileExec<STORAGE,
273  camp::idx_seq<DIM0>,
274  camp::idx_seq<IDX, IDX_REST...>>
275 {
// NOTE(review): source line 276 is missing from this listing. The two lines
// below are the tail of a template-id with the same dimension and one fewer
// index; presumably the NextExec alias used in exec() -- confirm.
277  camp::idx_seq<DIM0>,
278  camp::idx_seq<IDX_REST...>>;
279 
// Handles the tile position encoded in TTYPE, then recurses via NextExec.
//
//  otile - original range; its begin/size along DIM0 are read from the type
//  tile  - current tile; its begin along DIM0 is read from the type
//  body  - callable invoked on the full or partial tile
280  template<typename OTILE, typename TTYPE, typename BODY>
281  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const& otile,
282  TTYPE const& tile,
283  BODY&& body)
284  {
285  auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
286  auto constexpr orig_size = OTILE::size_type::value_at(DIM0);
287 
288  auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
289 
// begin of the following tile: one storage step further along DIM0
290  using NextBegin =
291  camp::integral_constant<typename TTYPE::index_type,
292  tile_begin + STORAGE::s_dim_elem(DIM0)>;
// extent left between tile_begin and the end of the original range
293  using TailSize =
294  camp::integral_constant<typename TTYPE::index_type,
295  (orig_begin + orig_size) - tile_begin>;
296 
297  using NextTile =
298  typename expt::SetStaticTensorTileBegin<TTYPE, NextBegin,
299  (size_t)DIM0>::Type;
300 
301  using TailTile = typename expt::SetStaticTensorTileSize<TTYPE, TailSize,
302  (size_t)DIM0>::Type;
303  using PartTile = typename TailTile::Partial;
304 
305 
// Equivalent to tile_begin <= orig_begin + orig_size: rejects, at compile
// time, a tile that starts beyond the end of the original range.
306  static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
307  (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
308  "OOB StaticTensorTileExec ACROSS");
309 
// Both conditions are compile-time constants but are tested with an ordinary
// `if`, so both branches are compiled for every instantiation.
310  if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size))
311  {
// a full step fits: run the body here, then move on to the next position
312  body(tile);
313  NextTile next_tile;
314  NextExec::exec(otile, next_tile, body);
315  }
316  else if (tile_begin < (orig_begin + orig_size))
317  {
// only a remainder is left: run the body on a partial tile and stop
318  PartTile part_tile;
319  body(part_tile);
320  }
321  }
322 };
323 
// Terminating case of the compile-time tiling executor: the index sequence
// is empty, so there is nothing left to do regardless of which dimensions
// are listed. This ends the NextExec recursion of the specializations above.
324 template<typename STORAGE, camp::idx_t... DIM_REST>
325 struct StaticTensorTileExec<STORAGE,
326  camp::idx_seq<DIM_REST...>,
327  camp::idx_seq<>>
328 {
329 
// Intentionally a no-op; all three arguments are ignored.
330  template<typename OTILE, typename TTYPE, typename BODY>
331  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const&,
332  TTYPE const&,
333  BODY&&)
334  {}
335 };
336 
// Compile-time (static tile) tiling driver: builds a full-size static tile
// type from the input tile's begin values and runs the compile-time tiling
// path over it.
//
// NOTE(review): source lines 345-346 (return type, function name and the
// first parameter, which is referred to as orig_tile below) are missing from
// this listing. INDEX_TYPE, TENSOR_SIZE, TBEGIN and TSIZE presumably spell the
// static tile type of that parameter, and the function is presumably an
// overload of tensorTileExec_expanded -- confirm against the original header.
337 template<typename STORAGE,
338  typename INDEX_TYPE,
339  TensorTileSize TENSOR_SIZE,
340  typename TBEGIN,
341  typename TSIZE,
342  typename BODY,
343  camp::idx_t... IDX_SEQ,
344  camp::idx_t... DIM_SEQ>
347  BODY&& body,
348  camp::idx_seq<IDX_SEQ...> const&,
349  camp::idx_seq<DIM_SEQ...> const&)
350 {
351 
// NOTE(review): source line 352 is missing; it presumably defines the
// InputType alias used on the next line.
353 
354  using InputBegin = typename InputType::begin_type;
355 
// Full tile type: begin values taken from the input type for each index in
// IDX_SEQ, sizes set to STORAGE::s_dim_elem(i) for each index in IDX_SEQ.
356  using Type = StaticTensorTile<
357  INDEX_TYPE, TENSOR_FULL,
358  camp::int_seq<INDEX_TYPE, InputBegin::value_at(IDX_SEQ)...>,
359  camp::int_seq<INDEX_TYPE, STORAGE::s_dim_elem(IDX_SEQ)...>>;
360 
// default-constructed: the tile's begin and size are encoded in its type
361  Type full_tile;
362 
363  // Do all of the tiling loops in layout order, this may improve
364  // cache performance
365  using layout_order = typename STORAGE::layout_type::seq_t;
366  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
367 
368 
369  tensor_tile_exec_t::static_exec(orig_tile, full_tile, body);
370 }
371 
372 template<typename STORAGE, typename TILE_TYPE, typename BODY>
373 RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec(TILE_TYPE const& tile,
374  BODY&& body)
375 {
376  using layout_type = typename STORAGE::layout_type;
377  tensorTileExec_expanded<STORAGE>(
378  tile, body, camp::make_idx_seq_t<STORAGE::s_num_dims> {}, layout_type {});
379 }
380 
381 } // namespace expt
382 } // namespace internal
383 
384 } // namespace RAJA
385 
386 
387 #endif
RAJA header file defining SIMD/SIMT register operations.
Header file for common RAJA internal macro definitions.
#define RAJA_HOST_DEVICE
Definition: macros.hpp:65
RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec(TILE_TYPE const &tile, BODY &&body)
Definition: TensorTileExec.hpp:373
TensorTileSize
Definition: TensorRef.hpp:234
@ TENSOR_FULL
Definition: TensorRef.hpp:236
RAJA_INLINE constexpr RAJA_HOST_DEVICE TensorTile< INDEX_TYPE, TENSOR_FULL, NUM_DIMS > & make_tensor_tile_full(TensorTile< INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS > &tile)
Definition: TensorRef.hpp:721
RAJA_INLINE constexpr RAJA_HOST_DEVICE TensorTile< INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS > & make_tensor_tile_partial(TensorTile< INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS > &tile)
Definition: TensorRef.hpp:733
RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec_expanded(TILE_TYPE const &orig_tile, BODY &&body, camp::idx_seq< IDX_SEQ... > const &, camp::idx_seq< DIM_SEQ... > const &)
Definition: TensorTileExec.hpp:163
Definition: AlignedRangeIndexSetBuilders.cpp:35
RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, TILE_T tile_size, SEGMENT const &segment, BODY const &body)
Definition: launch_core.hpp:589
auto & body
Definition: launch.hpp:177
RAJA header file defining SIMD/SIMT register operations.
RAJA_HOST_DEVICE static RAJA_INLINE void exec(OTILE const &, TTYPE const &, BODY &&)
Definition: TensorTileExec.hpp:331
RAJA_HOST_DEVICE static RAJA_INLINE void exec(OTILE const &otile, TTYPE const &tile, BODY &&body)
Definition: TensorTileExec.hpp:281
RAJA_HOST_DEVICE static RAJA_INLINE void exec(OTILE const &otile, TTYPE const &tile, BODY &&body)
Definition: TensorTileExec.hpp:224
Definition: TensorTileExec.hpp:39
Definition: TensorRef.hpp:309
RAJA_HOST_DEVICE static RAJA_INLINE void exec(OTILE const &otile, TTYPE &tile, BODY &&body)
Definition: TensorTileExec.hpp:54
RAJA_HOST_DEVICE static RAJA_INLINE void static_exec(OTILE const &otile, TTYPE const &tile, BODY &&body)
Definition: TensorTileExec.hpp:100
RAJA_HOST_DEVICE static RAJA_INLINE void static_exec(OTILE const &, TTYPE const &tile, BODY &&body)
Definition: TensorTileExec.hpp:148
RAJA_HOST_DEVICE static RAJA_INLINE void exec(OTILE &, TTYPE const &tile, BODY &&body)
Definition: TensorTileExec.hpp:138
Definition: TensorTileExec.hpp:42