26 if(i == SrcVectorDim && i == DstVectorDim)
28 return math::lcm(SrcScalarPerVector, DstScalarPerVector);
30 else if(i == SrcVectorDim)
32 return SrcScalarPerVector;
34 else if(i == DstVectorDim)
36 return DstScalarPerVector;
53template <
typename SliceLengths,
54 typename ScaleSliceLengths,
55 typename SrcElementwiseOperation,
56 typename ScaleElementwiseOperation,
57 typename DstElementwiseOperation,
65 typename SrcDimAccessOrder,
66 typename DstDimAccessOrder,
72 index_t SrcScalarStrideInVector,
73 index_t ScaleScalarStrideInVector,
74 index_t DstScalarStrideInVector,
75 bool SrcResetCoordinateAfterRun,
78 bool DstResetCoordinateAfterRun,
94 const SrcDesc& src_desc,
95 const Index& src_slice_origin,
96 const SrcElementwiseOperation& src_element_op,
97 const ScaleDesc& scale_desc,
98 const Index& scale_slice_origin,
99 const ScaleElementwiseOperation& scale_element_op,
100 const DstDesc& dst_desc,
101 const Index& dst_slice_origin,
102 const DstElementwiseOperation& dst_element_op)
106 src_element_op_(src_element_op),
107 scale_element_op_(scale_element_op),
108 dst_element_op_(dst_element_op)
118 const Index& scale_slice_origin_idx)
128 template <
typename SrcBuffer, index_t ThreadScratchId = 0>
129 __device__
void RunRead(
const SrcDesc& src_desc,
130 const SrcBuffer& src_buf,
139 "wrong! SrcBuffer and SrcData data type are inconsistent");
146 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
148 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
150 constexpr auto ordered_src_access_lengths =
156 Index forward_step_idx;
158 static_for<0, nDim, 1>{}([&](
auto j) {
159 forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
169 Index backward_step_idx;
171 static_for<0, nDim, 1>{}([&](
auto j) {
172 backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
180 static_ford<
decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
182 constexpr auto forward_sweep = [&]() {
185 forward_sweep_(
I0) =
true;
187 static_for<1, nDim, 1>{}([&](
auto i) {
188 index_t tmp = ordered_src_access_idx[
I0];
190 static_for<1, i, 1>{}([&](
auto j) {
191 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
194 forward_sweep_(i) = tmp % 2 == 0;
197 return forward_sweep_;
201 constexpr auto src_data_idx = [&]() {
204 static_for<0, nDim, 1>{}([&](
auto i) {
205 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
206 : ordered_src_access_lengths[i] - 1 -
207 ordered_src_access_idx[i];
211 src_scalar_per_access;
217 const bool is_src_valid =
221 using src_vector_t =
typename src_vector_type::type;
224 auto src_vector_container = src_vector_type{
225 src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
228 src_thread_scratch_tuple_(thread_scratch_id)
229 .template SetAsType<src_vector_t>(
230 src_data_idx_seq, src_vector_container.template AsType<src_vector_t>()[
I0]);
232 constexpr auto move_on_dim = [&]()
constexpr {
235 static_for<0, nDim, 1>{}([&](
auto i) {
236 move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
238 static_for<i + 1, nDim, 1>{}([&](
auto j) {
240 ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
248 static_for<0, nDim, 1>{}([&](
auto i) {
249 if constexpr(move_on_dim[i])
251 if constexpr(forward_sweep[i])
254 src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
259 src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
266 if constexpr(SrcResetCoordinateAfterRun)
268 const auto src_reset_step =
275 template <
typename ScaleBuffer>
276 __device__
void RunScaleRead(
const ScaleDesc& scale_desc,
const ScaleBuffer& scale_buf)
284 "wrong! ScaleBuffer and ScaleData data type are inconsistent");
291 constexpr auto scale_access_lengths = SliceLengths{} / scale_scalar_per_access;
293 constexpr auto scale_dim_access_order = SrcDimAccessOrder{};
295 constexpr auto ordered_scale_access_lengths =
301 Index forward_step_idx;
304 forward_step_idx(j) = (i.value == j.value) ? scale_scalar_per_access[i] : 0;
314 Index backward_step_idx;
317 backward_step_idx(j) = (i.value == j.value) ? -scale_scalar_per_access[i] : 0;
325 static_ford<
decltype(ordered_scale_access_lengths)>{}([&](
auto ordered_scale_access_idx) {
327 constexpr auto forward_sweep = [&]() {
330 forward_sweep_(
I0) =
true;
333 index_t tmp = ordered_scale_access_idx[
I0];
336 tmp = tmp * ordered_scale_access_lengths[j] + ordered_scale_access_idx[j];
339 forward_sweep_(i) = tmp % 2 == 0;
342 return forward_sweep_;
346 constexpr auto scale_data_idx = [&]() {
350 ordered_idx(i) = forward_sweep[i] ? ordered_scale_access_idx[i]
351 : ordered_scale_access_lengths[i] - 1 -
352 ordered_scale_access_idx[i];
356 scale_scalar_per_access;
359 constexpr auto scale_data_idx_seq =
361 Number<scale_data_idx.Size()>{});
364 scale_desc, scale_coord_);
367 using scale_vector_t =
typename scale_vector_type::type;
370 auto scale_vector_container = scale_vector_type{
371 scale_buf.template Get<scale_vector_t>(scale_coord_.GetOffset(), is_scale_valid)};
374 scale_thread_scratch_.template SetAsType<scale_vector_t>(
375 scale_data_idx_seq, scale_vector_container.template AsType<scale_vector_t>()[
I0]);
377 constexpr auto move_on_dim = [&]()
constexpr {
382 ordered_scale_access_idx[i] < ordered_scale_access_lengths[i] - 1;
386 ordered_scale_access_idx[j] == ordered_scale_access_lengths[j] - 1;
395 if constexpr(move_on_dim[i])
397 if constexpr(forward_sweep[i])
401 scale_forward_steps[scale_dim_access_order[i]]);
407 scale_backward_steps[scale_dim_access_order[i]]);
425 template <index_t ThreadScratchId>
429#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
432 dst_thread_scratch_(idx) =
438 if constexpr(SrcVectorDim != DstVectorDim &&
441 SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
444 SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
454 static_assert(SrcVectorDim != DstVectorDim,
"wrong");
466 DstScalarPerVector>{},
469 constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
471 static_ford<
decltype(access_lengths)>{}([&](
auto access_idx) {
472 constexpr auto data_idx = access_idx * scalar_per_access;
483 [&](
auto i) ->
const src_vector_t& {
485 return src_thread_scratch_tuple_[thread_scratch_id].GetVectorTypeReference(
486 data_idx_seq + i * dst_scalar_step_in_vector);
492 [&](
auto i) -> dst_vector_t& {
494 return dst_thread_scratch_.GetVectorTypeReference(
495 data_idx_seq + i * src_scalar_step_in_vector);
501 src_vector_refs, dst_vector_refs);
510 DstScalarPerVector>{},
513 constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
516 using src_vector_t =
typename src_vector_type::type;
519 using src_converted_vector_t =
typename src_converted_vector_type::type;
521 static_ford<
decltype(access_lengths)>{}([&](
auto access_idx) {
522 auto src_vector_container = src_vector_type{
523 src_thread_scratch_tuple_[thread_scratch_id].template GetAsType<src_vector_t>(
526 auto src_converted_vector_container =
527 src_converted_vector_type{fast_numeric_converter(src_vector_container)};
529 src_converted_thread_scratch_.template SetAsType<src_converted_vector_t>(
531 src_converted_vector_container.template AsType<src_converted_vector_t>()[
I0]);
537 constexpr auto scale_idx =
Sequence<
I0, idx.At(1),
I0>{};
540 src_element_op_(dst_v,
541 src_converted_thread_scratch_[idx] * scale_thread_scratch_[scale_idx]);
542 dst_thread_scratch_(idx) = dst_v;
547 template <
typename DstBuffer, index_t ThreadScratchId = 0>
562 "wrong! SrcBuffer or DstBuffer data type is wrong");
569 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
571 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
573 constexpr auto ordered_dst_access_lengths =
579 Index forward_step_idx;
581 static_for<0, nDim, 1>{}([&](
auto j) {
582 forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
592 Index backward_step_idx;
594 static_for<0, nDim, 1>{}([&](
auto j) {
595 backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
603 static_ford<
decltype(ordered_dst_access_lengths)>{}([&](
auto ordered_dst_access_idx) {
605 constexpr auto forward_sweep = [&]() {
608 forward_sweep_(
I0) =
true;
610 static_for<1, nDim, 1>{}([&](
auto i) {
611 index_t tmp = ordered_dst_access_idx[
I0];
613 static_for<1, i, 1>{}([&](
auto j) {
614 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
617 forward_sweep_(i) = tmp % 2 == 0;
620 return forward_sweep_;
624 constexpr auto dst_data_idx = [&]() {
627 static_for<0, nDim, 1>{}([&](
auto i) {
628 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
629 : ordered_dst_access_lengths[i] - 1 -
630 ordered_dst_access_idx[i];
634 dst_scalar_per_access;
640 const bool is_dst_valid =
644 using dst_vector_t =
typename dst_vector_type::type;
647 auto dst_vector_container = dst_vector_type{
648 dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq)};
650 static_for<0, DstScalarPerVector, 1>{}([&](
auto i) {
654 dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
656 dst_vector_container.template AsType<DstData>()(i) = dst_v;
661 dst_coord_.GetOffset(),
663 dst_vector_container.template AsType<dst_vector_t>()[
I0]);
665 constexpr auto move_on_dim = [&]()
constexpr {
668 static_for<0, nDim, 1>{}([&](
auto i) {
669 move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
671 static_for<i + 1, nDim, 1>{}([&](
auto j) {
673 ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
681 static_for<0, nDim, 1>{}([&](
auto i) {
682 if constexpr(move_on_dim[i])
684 if constexpr(forward_sweep[i])
687 dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
692 dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
699 if constexpr(DstResetCoordinateAfterRun)
701 const auto dst_reset_step =
715 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
717 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
719 constexpr auto ordered_src_access_lengths =
723 constexpr auto forward_sweep = [&]() {
726 forward_sweep_(
I0) =
true;
729 index_t tmp = ordered_src_access_lengths[
I0] - 1;
732 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
735 forward_sweep_(i) = tmp % 2 == 0;
738 return forward_sweep_;
743 constexpr auto src_data_idx = [&]() {
747 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
751 src_scalar_per_access;
755 constexpr auto reset_src_data_step = [&]() {
756 Index reset_src_data_step_;
760 return reset_src_data_step_;
763 return reset_src_data_step;
773 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
775 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
777 constexpr auto ordered_dst_access_lengths =
781 constexpr auto forward_sweep = [&]() {
784 forward_sweep_(
I0) =
true;
787 index_t tmp = ordered_dst_access_lengths[
I0] - 1;
790 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
793 forward_sweep_(i) = tmp % 2 == 0;
796 return forward_sweep_;
801 constexpr auto dst_data_idx = [&]() {
805 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
809 dst_scalar_per_access;
813 constexpr auto reset_dst_data_step = [&]() {
814 Index reset_dst_data_step_;
818 return reset_dst_data_step_;
821 return reset_dst_data_step;
826 const Index& src_slice_origin_step_idx)
829 const auto adjusted_step_idx =
830 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
841 const Index& dst_slice_origin_step_idx)
844 const auto adjusted_step_idx =
845 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
859 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
865 constexpr auto desc0 =
871 if constexpr(i == SrcVectorDim)
874 make_tuple(src_access_lengths_and_vector_length[i],
886 if constexpr(i == SrcVectorDim)
897 constexpr auto up_dim_idss =
909 constexpr auto scale_access_lengths = SliceLengths{} / scale_scalar_per_access;
915 constexpr auto desc0 =
921 if constexpr(i == SrcVectorDim)
924 make_tuple(scale_access_lengths_and_vector_length[i],
925 scale_access_lengths_and_vector_length[
Number<nDim>{}]));
936 if constexpr(i == SrcVectorDim)
947 constexpr auto up_dim_idss =
959 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
964 constexpr auto desc0 =
970 if constexpr(i == DstVectorDim)
973 make_tuple(dst_access_lengths_and_vector_length[i],
985 if constexpr(i == DstVectorDim)
996 constexpr auto up_dim_idss =
1004 static constexpr auto scale_thread_scratch_desc_ =
1017 decltype(src_thread_scratch_desc_),
1021 using SrcThreadConvertedScratch =
1025 decltype(src_thread_scratch_desc_),
1031 ScaleScalarPerVector,
1032 decltype(scale_thread_scratch_desc_),
1039 decltype(dst_thread_scratch_desc_),
1042 using FastTypeConverter = tensor_operation::element_wise::
1043 FastNumericArrayConverter<SrcData, DstData, SrcScalarPerVector>;
1046 SrcThreadConvertedScratch src_converted_thread_scratch_;
1047 ScaleThreadScratch scale_thread_scratch_;
1049 DstThreadScratch dst_thread_scratch_;
1050 FastTypeConverter fast_numeric_converter;
1055 const SrcElementwiseOperation src_element_op_;
1056 const ScaleElementwiseOperation scale_element_op_;
1057 const DstElementwiseOperation dst_element_op_;
Definition threadwise_tensor_slice_transfer_util.hpp:15
__host__ __device__ constexpr auto lcm(X x, Y y)
Definition utility/math.hpp:198
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto container_push_back(const Array< TData, NSize > &a, const TData &x)
Definition utility/container_helper.hpp:18
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Lds
Definition amd_address_space.hpp:18
@ Global
Definition amd_address_space.hpp:17
@ Vgpr
Definition amd_address_space.hpp:20
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr Y type_convert(X x)
Definition utility/type_convert.hpp:98
__host__ __device__ constexpr auto generate_sequence_v2(F &&f, Number< N >)
Definition sequence_helper.hpp:25
__host__ __device__ constexpr auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition utility/container_helper.hpp:54
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition utility/container_helper.hpp:43
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
Definition utility/sequence.hpp:43
static __device__ constexpr auto GetScaleThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:903
ck::ThreadwiseTensorSliceTransfer_v3r1_dequant< decltype(thread_slice_lengths), decltype(scale_thread_slice_lengths), SrcElementwiseOperation, ScaleElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, ScaleData, DstData, SrcDesc, ScaleDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, ScaleScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, ScaleScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::DstCoord decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:89
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:123
ck::ThreadwiseTensorSliceTransfer_v3r1_dequant< decltype(thread_slice_lengths), decltype(scale_thread_slice_lengths), SrcElementwiseOperation, ScaleElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, ScaleData, DstData, SrcDesc, ScaleDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, ScaleScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, ScaleScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::ScaleCoord decltype(make_tensor_coordinate(SrcDesc{}, Index{})) ScaleCoord
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:88
static __device__ constexpr auto GetDstThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:953
ck::ThreadwiseTensorSliceTransfer_v3r1_dequant< decltype(thread_slice_lengths), decltype(scale_thread_slice_lengths), SrcElementwiseOperation, ScaleElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, ScaleData, DstData, SrcDesc, ScaleDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, ScaleScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, ScaleScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I0 static constexpr auto I0
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:91
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:825
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:548
ck::ThreadwiseTensorSliceTransfer_v3r1_dequant< decltype(thread_slice_lengths), decltype(scale_thread_slice_lengths), SrcElementwiseOperation, ScaleElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, ScaleData, DstData, SrcDesc, ScaleDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, ScaleScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, ScaleScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::Index MultiIndex< nDim > Index
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:85
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:129
__device__ void SetScaleSliceOrigin(const ScaleDesc &scale_desc, const Index &scale_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:117
__device__ constexpr ThreadwiseTensorSliceTransfer_v3r1_dequant(const SrcDesc &src_desc, const Index &src_slice_origin, const SrcElementwiseOperation &src_element_op, const ScaleDesc &scale_desc, const Index &scale_slice_origin, const ScaleElementwiseOperation &scale_element_op, const DstDesc &dst_desc, const Index &dst_slice_origin, const DstElementwiseOperation &dst_element_op)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:93
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:112
static __device__ constexpr auto GetSrcThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:854
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:840
ck::ThreadwiseTensorSliceTransfer_v3r1_dequant< decltype(thread_slice_lengths), decltype(scale_thread_slice_lengths), SrcElementwiseOperation, ScaleElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, ScaleData, DstData, SrcDesc, ScaleDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, ScaleScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, ScaleScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::nDim static constexpr index_t nDim
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:84
__device__ void RunScaleRead(const ScaleDesc &scale_desc, const ScaleBuffer &scale_buf)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:276
ck::ThreadwiseTensorSliceTransfer_v3r1_dequant< decltype(thread_slice_lengths), decltype(scale_thread_slice_lengths), SrcElementwiseOperation, ScaleElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, ScaleData, DstData, SrcDesc, ScaleDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, ScaleScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, ScaleScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::SrcCoord decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:87
static __device__ constexpr auto GetSrcCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:708
static __device__ constexpr auto GetDstCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:766
__device__ void TransferDataFromSrcThreadScratchToDstThreadScratch(Number< ThreadScratchId > thread_scratch_id)
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:427
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:23
__host__ __device__ constexpr auto operator()(index_t i) const
Definition threadwise_tensor_slice_transfer_v3r1_dequant.hpp:24
Definition threadwise_tensor_slice_transfer_util.hpp:20
Definition threadwise_tensor_slice_transfer_util.hpp:29
Definition functional2.hpp:33
Definition functional3.hpp:97
Definition utility/transpose_vectors.hpp:16