threadwise_tensor_slice_transfer_v7r2.hpp Source File

threadwise_tensor_slice_transfer_v7r2.hpp Source File#

Composable Kernel: threadwise_tensor_slice_transfer_v7r2.hpp Source File
threadwise_tensor_slice_transfer_v7r2.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
12
14
15namespace ck {
16// Thread-level multi-source, multi-destination tensor slice data movement
17// Assume:
18// 1. All sources and destinations are DynamicBuffer
19// 2. Same VectorDim and ScalerPerVector for all sources and destinations
20// 3. DstInMemOps are per destination tensor
21// 4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
22// 5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
23// 6. Does not need to know src_descs and dst_descs at compile-time
24// 7. Does not need to know src_slice_origins and dst_slice_origins at compile-time,
25//
26// Does following things to avoid scratch memory issue
27// 1. Use StaticallyIndexedArray or vector_type instead of C array for thread buffer
28// 2. Pass tensor descritpors by reference (or tuple of references)
29// 3. Does not keep reference to tensor descriptor
30// 4. Does not construct new tensor coordinate when call Run()
31template <typename SrcDatas,
32 typename DstDatas,
33 typename SrcDescs,
34 typename DstDescs,
35 typename ElementwiseOperation,
36 typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
37 typename SliceLengths,
38 typename SrcDimAccessOrder,
39 typename DstDimAccessOrder,
40 index_t SrcVectorDim,
41 index_t DstVectorDim,
42 index_t SrcScalarPerVector,
43 index_t DstScalarPerVector,
44 typename SrcResetCoordinateAfterRunFlags, // Sequence<bool ...>
45 typename DstResetCoordinateAfterRunFlags, // Sequence<bool ...>
46 index_t NumThreadScratch = 1>
48{
49 static constexpr auto I0 = Number<0>{};
50
51 static constexpr index_t nDim = SliceLengths::Size();
52
53 static constexpr index_t nSrc = SrcDescs::Size();
54 static constexpr index_t nDst = DstDescs::Size();
55
57
58 // return a tuple of coordiantes for a tuple of tensor
59 template <typename Descs,
60 typename Indices,
61 enable_if_t<Descs::Size() == Indices::Size(), bool> = false>
62 static constexpr auto MakeCoordinates(const Descs& descs, const Indices& indices)
63 {
64 return generate_tuple([&](auto i) { return make_tensor_coordinate(descs[i], indices[i]); },
65 Number<Descs::Size()>{});
66 }
67
70
71 // scalar per access on each dim
72 // FIXME: don't use lambda_scalar_per_access
75
78
80 SrcDimAccessOrder,
82 false>;
83
85 DstDimAccessOrder,
87 false>;
88
90 const SrcDescs& src_descs,
91 const StaticallyIndexedArray<Index, nSrc>& src_slice_origins,
92 const DstDescs& dst_descs,
93 const StaticallyIndexedArray<Index, nDst>& dst_slice_origins,
94 const ElementwiseOperation& element_op)
95 : src_coords_(MakeCoordinates(src_descs, src_slice_origins)),
96 dst_coords_(MakeCoordinates(dst_descs, dst_slice_origins)),
97 element_op_(element_op)
98 {
99 static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
100 "wrong! cannot evenly divide");
101
102 static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
103 "wrong! cannot evenly divide");
104 }
105
106 template <typename Indices, enable_if_t<SrcDescs::Size() == Indices::Size(), bool> = false>
107 __device__ void SetSrcSliceOrigins(const SrcDescs& src_descs,
108 const Indices& src_slice_origin_idxs)
109 {
110 static_for<0, nSrc, 1>{}([&](auto i) {
111 src_coords_(i) = make_tensor_coordinate(src_descs[i], src_slice_origin_idxs[i]);
112 });
113 }
114
115 template <typename Indices, enable_if_t<DstDescs::Size() == Indices::Size(), bool> = false>
116 __device__ void SetDstSliceOrigins(const DstDescs& dst_descs,
117 const Indices& dst_slice_origin_idxs)
118 {
119 static_for<0, nDst, 1>{}([&](auto i) {
120 dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]);
121 });
122 }
123
124 template <typename DataTypes, index_t ScalarPerVector>
125 __device__ static auto generate_vectors()
126 {
127 auto data_types = DataTypes{};
128
129 constexpr index_t num = data_types.Size();
130
131 return generate_tuple(
132 [&](auto i) {
133 using DataType = remove_cvref_t<decltype(data_types[i])>;
134
136 },
137 Number<num>{});
138 }
139
140 // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
141 // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
142 template <typename SrcBuffers,
143 index_t ThreadScratchId = 0,
144 enable_if_t<SrcDescs::Size() == SrcBuffers::Size(), bool> = false>
145 __device__ void RunRead(const SrcDescs& src_descs,
146 const SrcBuffers& src_bufs,
148 {
149 // loop over space-filling curve
150 static_for<0, src_num_access, 1>{}([&](auto iAccess) {
153
154 bool oob_val = true;
155
156 // copy data from src_bufs into src_vectors
157 static_for<0, nSrc, 1>{}([&](auto i) {
158 using src_vector_t = typename remove_cvref_t<decltype(src_vectors[i])>::type;
159
160 const bool is_src_valid =
162 src_coords_[i]);
163
164 oob_val = oob_val & is_src_valid;
165
166 src_vectors(i).template AsType<src_vector_t>()(I0) =
167 src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(), true);
168 });
169
170 constexpr auto get_elem_op_vec_len = []() {
171 if constexpr(is_detected<is_pack8_invocable_t, decltype(element_op_)>::value)
172 {
173 if constexpr(decltype(element_op_)::is_pack8_invocable)
174 return math::min(8, SrcScalarPerVector);
175 }
176 if constexpr(is_detected<is_pack4_invocable_t, decltype(element_op_)>::value)
177 {
178 if constexpr(decltype(element_op_)::is_pack4_invocable)
179 return math::min(4, SrcScalarPerVector);
180 }
181 if constexpr(is_detected<is_pack2_invocable_t, decltype(element_op_)>::value)
182 {
183 if constexpr(decltype(element_op_)::is_pack2_invocable)
184 return math::min(2, SrcScalarPerVector);
185 }
186 return 1;
187 };
188
189 constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
190
191 // apply pointwise function
192 static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto i) {
193 // get reference to src data
194 const auto src_data_refs = generate_tie(
195 // return type should be lvalue
196 [&](auto iSrc) -> const auto& {
197 using SrcData = remove_cvref_t<tuple_element_t<iSrc.value, SrcDatas>>;
198
199 using elem_op_vec_t = typename vector_type<SrcData, elem_op_vec_len>::type;
200
201 return src_vectors[iSrc].template AsType<elem_op_vec_t>()[i];
202 },
203 Number<nSrc>{});
204
205 // get reference to dst data
206 auto dst_data_refs = generate_tie(
207 // return type should be lvalue
208 [&](auto iDst) -> auto& {
209 using DstData = remove_cvref_t<tuple_element_t<iDst.value, DstDatas>>;
210
211 using elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;
212
213 return elm_vectors(iDst).template AsType<elem_op_vec_t>()(i);
214 },
215 Number<nDst>{});
216
217 // apply pointwise function
218 // pointwise function signature:
219 // element_op_(dst_data_refs[I0],
220 // dst_data_refs[I1],
221 // ...,
222 // src_data_refs[I0],
223 // src_data_refs[I1],
224 // ...)
225 unpack2(element_op_, dst_data_refs, src_data_refs);
226 });
227
228 elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
229 oob_vectors_tuple_(thread_scratch_id)(iAccess) = oob_val;
230
231 // move coordinate
232 if constexpr(iAccess.value != src_num_access - 1)
233 {
234 constexpr auto forward_step = SrcSpaceFillingCurve::GetForwardStep(iAccess);
235
236 static_for<0, nSrc, 1>{}([&](auto i) {
237 move_tensor_coordinate(src_descs[i],
238 src_coords_(i),
239 make_tensor_coordinate_step(src_descs[i], forward_step));
240 });
241 }
242 });
243
244 // move coordinate back to slice origin (or not)
245 static_for<0, nSrc, 1>{}([&](auto i) {
246 if constexpr(SrcResetCoordinateAfterRunFlags::At(i))
247 {
248 const auto src_reset_step =
250
251 move_tensor_coordinate(src_descs[i], src_coords_(i), src_reset_step);
252 }
253 });
254 }
255
256#if 1
257 template <index_t ThreadScratchId = 0>
258 __device__ void OOBCheck(Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
259 {
260 // loop over space-filling curve
261 static_for<0, src_num_access, 1>{}([&](auto iAccess) {
262 auto elm_vectors = elm_vectors_tuple_[thread_scratch_id][iAccess];
263 auto oob_val = oob_vectors_tuple_[thread_scratch_id][iAccess];
264
265 static_for<0, nDst, 1>{}([&](auto i) {
266 using elm_vector_t = typename remove_cvref_t<decltype(elm_vectors[i])>::type;
267 elm_vectors(i).template AsType<elm_vector_t>()(I0) =
268 oob_val ? elm_vectors(i).template AsType<elm_vector_t>()[I0] : elm_vector_t{0};
269 });
270
271 elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
272 });
273 }
274#endif
275
276 template <index_t ThreadScratchId = 0>
277 __device__ void
279 {
280 using DstData = remove_cvref_t<decltype(DstDatas{}[I0])>;
281
282 using ElmThreadScratch =
283 StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
284 DstData,
285 SrcScalarPerVector,
287 true>;
288 using DstThreadScratch =
289 StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
290 DstData,
291 DstScalarPerVector,
293 true>;
294
295 ElmThreadScratch elm_thread_scratch_;
296 DstThreadScratch dst_thread_scratch_;
297
298 elm_thread_scratch_.data_ =
299 bit_cast<decltype(elm_thread_scratch_.data_)>(elm_vectors_tuple_[thread_scratch_id]);
300
301 if constexpr(SrcVectorDim != DstVectorDim &&
302 ((is_same<half_t, remove_cvref_t<DstData>>::value &&
303 SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
304 (is_same<f8_t, remove_cvref_t<DstData>>::value &&
305 SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0) ||
306 (is_same<int8_t, remove_cvref_t<DstData>>::value &&
307 SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
308 {
309 // each transpose does
310 // DstScalarPerVector # of src vectors in src_thread_scratch_
311 // SrcScalarPerVector # of dst vectors in dst_thread_scratch_
312 constexpr index_t num_src_vector = Number<DstScalarPerVector>{};
313 constexpr index_t num_dst_vector = Number<SrcScalarPerVector>{};
314
315 // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose
316 // TODO: make this logic generic for all scenario
317
318 constexpr auto src_scalar_step_in_vector = generate_sequence(
319 detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
320
321 constexpr auto dst_scalar_step_in_vector = generate_sequence(
322 detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
323
324 constexpr auto scalar_per_access = generate_sequence(
325 detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
326 SrcScalarPerVector,
327 DstVectorDim,
328 DstScalarPerVector>{},
329 Number<nDim>{});
330
331 constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
332
333 static_ford<decltype(access_lengths)>{}([&](auto access_idx) {
334 constexpr auto data_idx = access_idx * scalar_per_access;
335
336 constexpr auto data_idx_seq = generate_sequence_v2(
337 [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
338
341
342 // get DstScalarPerVector # of read-only references to src vectors from
343 // src_thread_scratch_
344 const auto src_vector_refs = generate_tie(
345 [&](auto i) -> const src_vector_t& {
346 // i increment corresponds to movement in DstVectorDim
347 return elm_thread_scratch_.GetVectorTypeReference(
348 data_idx_seq + i * dst_scalar_step_in_vector);
349 },
351
352 // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_
353 auto dst_vector_refs = generate_tie(
354 [&](auto i) -> dst_vector_t& {
355 // i increment corresponds to movement in SrcVectorDim
356 return dst_thread_scratch_.GetVectorTypeReference(
357 data_idx_seq + i * src_scalar_step_in_vector);
358 },
360
361 // do data transpose
362 transpose_vectors<DstData, DstScalarPerVector, SrcScalarPerVector>{}(
363 src_vector_refs, dst_vector_refs);
364 });
365 }
366 else
367 {
368 static_ford<SliceLengths>{}(
369 [&](auto idx) { dst_thread_scratch_(idx) = elm_thread_scratch_[idx]; });
370 }
371
372 dst_vectors_tuple_(thread_scratch_id) = bit_cast<DstVectorTuple>(dst_thread_scratch_.data_);
373 }
374
375 // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
376 // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
377 template <typename DstBuffers,
378 index_t ThreadScratchId = 0,
379 enable_if_t<DstDescs::Size() == 1 && DstBuffers::Size() == 1, bool> = false>
380 __device__ void RunWrite(const DstDescs& dst_descs,
381 DstBuffers dst_bufs,
383 {
384 OOBCheck(thread_scratch_id);
385 TransposeFromElmToDst(thread_scratch_id);
386
387 // loop over space-filling curve
388 static_for<0, dst_num_access, 1>{}([&](auto iAccess) {
389 auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess];
390
391 // copy data from buf_vectors into dst_bufs
392 static_for<0, nDst, 1>{}([&](auto i) {
393 using dst_vector_t = typename remove_cvref_t<decltype(dst_vectors[i])>::type;
394
395 const bool is_dst_valid =
397 dst_coords_[i]);
398
399 constexpr InMemoryDataOperationEnum DstInMemOp =
400 static_cast<InMemoryDataOperationEnum>(DstInMemOps::At(i.value));
401
402 dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
403 dst_coords_[i].GetOffset(),
404 is_dst_valid,
405 dst_vectors[i].template AsType<dst_vector_t>()[I0]);
406 });
407
408 // move coordinate
409 if constexpr(iAccess.value != dst_num_access - 1)
410 {
411 constexpr auto forward_step = DstSpaceFillingCurve::GetForwardStep(iAccess);
412
413 static_for<0, nDst, 1>{}([&](auto i) {
414 move_tensor_coordinate(dst_descs[i],
415 dst_coords_(i),
416 make_tensor_coordinate_step(dst_descs[i], forward_step));
417 });
418 }
419 });
420
421 static_for<0, nDst, 1>{}([&](auto i) {
422 if constexpr(DstResetCoordinateAfterRunFlags::At(i))
423 {
424 const auto dst_reset_step =
426
427 move_tensor_coordinate(dst_descs[i], dst_coords_(i), dst_reset_step);
428 }
429 });
430 }
431
432 // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
433 // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
434 // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
435 // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
436 template <typename SrcBuffers,
437 typename DstBuffers,
438 enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
439 DstDescs::Size() == DstBuffers::Size(),
440 bool> = false>
441 __device__ void Run(const SrcDescs& src_descs,
442 const SrcBuffers& src_bufs,
443 const DstDescs& dst_descs,
444 DstBuffers dst_bufs)
445 {
446 RunRead(src_descs, src_bufs);
447 RunWrite(dst_descs, dst_bufs);
448 }
449
450 __device__ static constexpr auto GetSrcCoordinateResetStep()
451 {
452 if constexpr(src_num_access == 0)
453 {
454 return typename SrcSpaceFillingCurve::Index{};
455 }
456 else
457 {
459 }
460 }
461
462 __device__ static constexpr auto GetDstCoordinateResetStep()
463 {
464 if constexpr(dst_num_access == 0)
465 {
466 return typename DstSpaceFillingCurve::Index{};
467 }
468 else
469 {
471 }
472 }
473
474 __device__ static constexpr auto GetSrcThreadScratchDescriptor()
475 {
476 // constexpr auto src_scalar_per_access = generate_sequence(
477 // detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
478
479 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
480
481 constexpr auto src_access_lengths_and_vector_length = container_push_back(
483
484 // 1st stage of transforms
485 constexpr auto desc0 =
486 make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
487
488 // 2nd stage of transforms
489 constexpr auto transforms = generate_tuple(
490 [&](auto i) {
491 if constexpr(i == SrcVectorDim)
492 {
494 make_tuple(src_access_lengths_and_vector_length[i],
495 src_access_lengths_and_vector_length[Number<nDim>{}]));
496 }
497 else
498 {
499 return make_pass_through_transform(src_access_lengths_and_vector_length[i]);
500 }
501 },
502 Number<nDim>{});
503
504 constexpr auto low_dim_idss = generate_tuple(
505 [&](auto i) {
506 if constexpr(i == SrcVectorDim)
507 {
508 return Sequence<i.value, nDim>{};
509 }
510 else
511 {
512 return Sequence<i.value>{};
513 }
514 },
515 Number<nDim>{});
516
517 constexpr auto up_dim_idss =
518 generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
519
520 return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
521 }
522
523 __device__ static constexpr auto GetDstThreadScratchDescriptor()
524 {
525 // 1st stage of transforms
526 // constexpr auto dst_scalar_per_access = generate_sequence(
527 // detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
528
529 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
530
531 constexpr auto dst_access_lengths_and_vector_length = container_push_back(
533
534 constexpr auto desc0 =
535 make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
536
537 // 2nd stage of transforms
538 constexpr auto transforms = generate_tuple(
539 [&](auto i) {
540 if constexpr(i == DstVectorDim)
541 {
543 make_tuple(dst_access_lengths_and_vector_length[i],
544 dst_access_lengths_and_vector_length[Number<nDim>{}]));
545 }
546 else
547 {
548 return make_pass_through_transform(dst_access_lengths_and_vector_length[i]);
549 }
550 },
551 Number<nDim>{});
552
553 constexpr auto low_dim_idss = generate_tuple(
554 [&](auto i) {
555 if constexpr(i == DstVectorDim)
556 {
557 return Sequence<i.value, nDim>{};
558 }
559 else
560 {
561 return Sequence<i.value>{};
562 }
563 },
564 Number<nDim>{});
565
566 constexpr auto up_dim_idss =
567 generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
568
569 return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
570 }
571
572 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
573 template <index_t ISrc>
574 __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs,
575 Number<ISrc> iSrc,
576 const Index& src_slice_origin_step_idx)
577 {
578 // if src coord was not reset by RunRead(), then need to adjust the step here
579 const auto adjusted_step_idx =
580 SrcResetCoordinateAfterRunFlags::At(iSrc)
581 ? src_slice_origin_step_idx
582 : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
583
584 // is it OK to construct a new step every time?
585 const auto adjusted_step = make_tensor_coordinate_step(src_descs[iSrc], adjusted_step_idx);
586
587 move_tensor_coordinate(src_descs[iSrc], src_coords_(iSrc), adjusted_step);
588 }
589
590 // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
591 template <index_t IDst>
592 __device__ void MoveDstSliceWindow(const DstDescs& dst_descs,
593 Number<IDst> iDst,
594 const Index& dst_slice_origin_step_idx)
595 {
596 // if dst coord was not reset by Run(), then need to adjust the step here
597 const auto adjusted_step_idx =
598 DstResetCoordinateAfterRunFlags::At(iDst)
599 ? dst_slice_origin_step_idx
600 : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
601
602 // is it OK to construct a new step every time?
603 const auto adjusted_step = make_tensor_coordinate_step(dst_descs[iDst], adjusted_step_idx);
604
605 move_tensor_coordinate(dst_descs[iDst], dst_coords_(iDst), adjusted_step);
606 }
607
608 private:
609 using SrcVectorsType = decltype(generate_vectors<SrcDatas, SrcScalarPerVector>());
610 using ElmVectorsType = decltype(generate_vectors<DstDatas, SrcScalarPerVector>());
611 using DstVectorsType = decltype(generate_vectors<DstDatas, DstScalarPerVector>());
612
613 static constexpr auto src_num_access = SrcSpaceFillingCurve::GetNumOfAccess();
614 static constexpr auto dst_num_access = DstSpaceFillingCurve::GetNumOfAccess();
615
618
621
624
625 SrcCoords src_coords_;
626 DstCoords dst_coords_;
627 const ElementwiseOperation element_op_;
628};
629
630} // namespace ck
__host__ __device__ constexpr T min(T x)
Definition utility/math.hpp:116
Definition ck.hpp:268
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
decltype(ck::declval< T & >().is_pack8_invocable) is_pack8_invocable_t
Definition is_detected.hpp:43
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto container_push_back(const Array< TData, NSize > &a, const TData &x)
Definition utility/container_helper.hpp:18
InMemoryDataOperationEnum
Definition ck.hpp:277
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
decltype(ck::declval< T & >().is_pack4_invocable) is_pack4_invocable_t
Definition is_detected.hpp:40
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
typename detail::detector< nonesuch, void, Op, Args... >::value_t is_detected
Definition is_detected.hpp:34
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Vgpr
Definition amd_address_space.hpp:20
typename tuple_element< I, TTuple >::type tuple_element_t
Definition utility/tuple.hpp:208
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr auto generate_sequence_v2(F &&f, Number< N >)
Definition sequence_helper.hpp:25
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
decltype(ck::declval< T & >().is_pack2_invocable) is_pack2_invocable_t
Definition is_detected.hpp:37
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__host__ __device__ constexpr Y bit_cast(const X &x)
Definition type.hpp:306
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
__host__ __device__ constexpr auto unpack2(F &&f, X &&x, Y &&y)
Definition functional4.hpp:55
typename std::enable_if< B, T >::type enable_if_t
Definition enable_if.hpp:27
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
Definition utility/sequence.hpp:43
Definition tensor_space_filling_curve.hpp:20
static __device__ __host__ constexpr auto GetStepBetween(Number< AccessIdx1dBegin >, Number< AccessIdx1dEnd >)
Definition tensor_space_filling_curve.hpp:52
__host__ static __device__ constexpr index_t GetNumOfAccess()
Definition tensor_space_filling_curve.hpp:41
static __device__ __host__ constexpr auto GetForwardStep(Number< AccessIdx1d >)
Definition tensor_space_filling_curve.hpp:66
MultiIndex< nDim > Index
Definition tensor_space_filling_curve.hpp:23
static __device__ auto generate_vectors()
Definition threadwise_tensor_slice_transfer_v7r2.hpp:125
__device__ constexpr ThreadwiseTensorSliceTransfer_v7r2(const SrcDescs &src_descs, const StaticallyIndexedArray< Index, nSrc > &src_slice_origins, const DstDescs &dst_descs, const StaticallyIndexedArray< Index, nDst > &dst_slice_origins, const ElementwiseOperation &element_op)
Definition threadwise_tensor_slice_transfer_v7r2.hpp:89
static __device__ constexpr auto GetSrcCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v7r2.hpp:450
__device__ void MoveSrcSliceWindow(const SrcDescs &src_descs, Number< ISrc > iSrc, const Index &src_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v7r2.hpp:574
static __device__ constexpr auto GetDstCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v7r2.hpp:462
__device__ void RunWrite(const DstDescs &dst_descs, DstBuffers dst_bufs, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r2.hpp:380
__device__ void Run(const SrcDescs &src_descs, const SrcBuffers &src_bufs, const DstDescs &dst_descs, DstBuffers dst_bufs)
Definition threadwise_tensor_slice_transfer_v7r2.hpp:441
__device__ void RunRead(const SrcDescs &src_descs, const SrcBuffers &src_bufs, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r2.hpp:145
__device__ void MoveDstSliceWindow(const DstDescs &dst_descs, Number< IDst > iDst, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v7r2.hpp:592
static __device__ constexpr auto GetDstThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v7r2.hpp:523
__device__ void SetSrcSliceOrigins(const SrcDescs &src_descs, const Indices &src_slice_origin_idxs)
Definition threadwise_tensor_slice_transfer_v7r2.hpp:107
__device__ void SetDstSliceOrigins(const DstDescs &dst_descs, const Indices &dst_slice_origin_idxs)
Definition threadwise_tensor_slice_transfer_v7r2.hpp:116
static __device__ constexpr auto GetSrcThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v7r2.hpp:474
static constexpr auto MakeCoordinates(const Descs &descs, const Indices &indices)
Definition threadwise_tensor_slice_transfer_v7r2.hpp:62
__device__ void OOBCheck(Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r2.hpp:258
__device__ void TransposeFromElmToDst(Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r2.hpp:278
Definition threadwise_tensor_slice_transfer_util.hpp:20
Definition functional2.hpp:33