copy.hpp Source File

copy.hpp Source File#

Composable Kernel: copy.hpp Source File
copy.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
7
14
15// Disable from doxygen docs generation
17namespace ck {
18namespace wrapper {
20
31template <typename DimAccessOrderTuple,
32 index_t VectorDim,
33 index_t ScalarPerVector,
34 typename SrcTensorType,
35 typename DstTensorType>
36__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
37{
38 static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
39 constexpr auto I0 = Number<0>{};
40 constexpr auto I1 = Number<1>{};
41
42 const auto& in_grid_desc = layout(src_tensor).GetUnrolledDescriptor();
43 const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();
44
45 using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
46 constexpr index_t num_dims = SrcShapeType::Size();
47
48 constexpr auto thread_slice_lengths =
49 generate_sequence_v2([](auto I) { return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
50 constexpr auto dim_access_order = generate_sequence_v2(
51 [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
52
53 if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
54 {
55 // Perform a copy between DynamicBuffers
56 auto transfer = ThreadwiseTensorSliceTransfer_v7<
57 Tuple<typename SrcTensorType::TensorElementType>,
58 Tuple<typename DstTensorType::TensorElementType>,
59 decltype(tie(in_grid_desc)),
60 decltype(tie(out_grid_desc)),
61 tensor_operation::element_wise::PassThrough,
62 Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
63 decltype(thread_slice_lengths),
64 decltype(dim_access_order),
65 VectorDim,
66 ScalarPerVector,
67 Sequence<true>,
68 Sequence<true>>{in_grid_desc,
69 make_tuple(src_tensor.GetMultiIdxOffsets()),
70 out_grid_desc,
71 make_tuple(dst_tensor.GetMultiIdxOffsets()),
72 tensor_operation::element_wise::PassThrough{}};
73
74 transfer.Run(tie(in_grid_desc),
75 tie(src_tensor.GetBuffer()),
76 tie(out_grid_desc),
77 tie(dst_tensor.GetBuffer()));
78 }
79 else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
80 {
81 // Perform copy from StaticBuffer to DynamicBuffer
82 const auto src_slice_origin_idxs =
83 generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
84
85 auto transfer =
86 ThreadwiseTensorSliceTransfer_v1r3<typename SrcTensorType::TensorElementType,
87 typename DstTensorType::TensorElementType,
88 remove_cvref_t<decltype(in_grid_desc)>,
89 remove_cvref_t<decltype(out_grid_desc)>,
90 tensor_operation::element_wise::PassThrough,
91 decltype(thread_slice_lengths),
92 decltype(dim_access_order),
93 VectorDim,
94 ScalarPerVector,
95 InMemoryDataOperationEnum::Set,
96 I1,
97 true>{out_grid_desc,
98 dst_tensor.GetMultiIdxOffsets(),
99 tensor_operation::element_wise::PassThrough{}};
100
101 transfer.Run(in_grid_desc,
102 src_slice_origin_idxs,
103 src_tensor.GetBuffer(),
104 out_grid_desc,
105 dst_tensor.GetBuffer());
106 }
107 else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
108 {
109 // Perform copy from DynamicBuffer to StaticBuffer
110 const auto dst_slice_origin_idxs =
111 generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
112 auto transfer = ThreadwiseTensorSliceTransfer_v2<
113 std::remove_const_t<typename SrcTensorType::TensorElementType>,
114 std::remove_const_t<typename DstTensorType::TensorElementType>,
115 remove_cvref_t<decltype(in_grid_desc)>,
116 remove_cvref_t<decltype(out_grid_desc)>,
117 decltype(thread_slice_lengths),
118 decltype(dim_access_order),
119 VectorDim,
120 ScalarPerVector,
121 I1,
122 false,
123 false>{in_grid_desc, src_tensor.GetMultiIdxOffsets()};
124
125 transfer.Run(in_grid_desc,
126 src_tensor.GetBuffer(),
127 out_grid_desc,
128 dst_slice_origin_idxs,
129 dst_tensor.GetBuffer());
130 }
131 else
132 {
133 // Perform copy between StaticBuffers
134 static_for<0, SrcShapeType::Size(), 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
135 }
136}
137
145template <typename SrcTensorType, typename DstTensorType>
146__host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
147{
148 // Generate default params
149 using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
150 constexpr index_t num_dims = SrcShapeType::Size();
151 // Incrementing dims 0, 1, 2 ... num_dims - 1
152 constexpr auto dim_access_order_tuple =
153 generate_tuple([](auto i) { return Number<i>{}; }, Number<num_dims>{});
154 constexpr index_t vector_dim = num_dims - 1;
155 constexpr index_t scalar_per_vector = 1;
157}
158
172template <typename DimAccessOrderTuple,
173 index_t VectorDim,
174 index_t ScalarPerVector,
175 typename SrcTensorType,
176 typename DstTensorType,
177 typename ThreadShape,
178 typename ThreadUnrolledDesc>
179__device__ void
180blockwise_copy(const SrcTensorType& src_tensor,
181 DstTensorType& dst_tensor,
182 [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc>& thread_layout)
183{
184 static_assert(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer);
185 static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
186
187 const auto& in_grid_desc = layout(src_tensor).GetUnrolledDescriptor();
188 const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();
189
190 using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
191 constexpr index_t num_dims = SrcShapeType::Size();
192
193 constexpr auto tile_lengths_seq =
194 generate_sequence_v2([](auto I) { return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
195 constexpr auto thread_layout_seq =
196 generate_sequence_v2([](auto I) { return size<I>(ThreadShape{}); }, Number<num_dims>{});
197 constexpr auto dim_access_order = generate_sequence_v2(
198 [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
199
200 using ThisThreadBlock = ThisThreadBlock<size(ThreadShape{})>;
201
202 // Perform copy between DynamicBuffers
203 auto transfer = ThreadGroupTensorSliceTransfer_v7<
204 ThisThreadBlock,
205 Tuple<typename SrcTensorType::TensorElementType>,
206 Tuple<typename DstTensorType::TensorElementType>,
207 decltype(tie(in_grid_desc)),
208 decltype(tie(out_grid_desc)),
209 tensor_operation::element_wise::PassThrough,
210 Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
211 std::remove_const_t<decltype(tile_lengths_seq)>,
212 std::remove_const_t<decltype(thread_layout_seq)>,
213 std::remove_const_t<decltype(dim_access_order)>,
214 std::remove_const_t<decltype(dim_access_order)>,
215 VectorDim,
216 ScalarPerVector,
217 Sequence<true>,
218 Sequence<true>>{in_grid_desc,
219 make_tuple(src_tensor.GetMultiIdxOffsets()),
220 out_grid_desc,
221 make_tuple(dst_tensor.GetMultiIdxOffsets()),
222 tensor_operation::element_wise::PassThrough{}};
223
224 transfer.Run(tie(in_grid_desc),
225 tie(src_tensor.GetBuffer()),
226 tie(out_grid_desc),
227 tie(dst_tensor.GetBuffer()));
228}
229
230} // namespace wrapper
231} // namespace ck
__device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)
Perform optimized copy between two tensors partitions (threadwise copy). Tensors must have the same s...
Definition copy.hpp:36
__device__ void blockwise_copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor, const Layout< ThreadShape, ThreadUnrolledDesc > &thread_layout)
Perform optimized blockwise copy between two tensors. Tensors must have the same size.
Definition copy.hpp:180
__host__ __device__ constexpr const auto & shape(const LayoutType &layout)
Get Layout shape.
Definition layout_utils.hpp:431
Definition ck.hpp:268
int32_t index_t
Definition ck.hpp:299
Layout wrapper that performs the tensor descriptor logic.
Definition layout.hpp:24
__host__ __device__ constexpr const auto & layout(const Tensor< BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType > &tensor)
Get Tensor Layout.
Definition tensor_utils.hpp:162