utility.hpp Source File

utility.hpp Source File#

Composable Kernel: utility.hpp Source File
utility.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
6// Address Space for AMDGCN
7// https://llvm.org/docs/AMDGPUUsage.html#address-space
8
13
14#include <stdint.h>
15
16namespace ck_tile {
17
18// TODO: we have "memory" clobber here because this inline asm is used for async copy
20{
21 asm volatile("s_mov_b32 m0, %0" : : "s"(v) : "memory");
22}
23
24// NOTE: this is an immediate value
26{
27 asm volatile("s_add_u32 m0, %0, m0" : : "n"(v) : "memory");
28}
29
30template <typename T>
31CK_TILE_DEVICE T warp_shuffle_up(const T& v_local, uint32_t lane_delta)
32{
33#if 0
34 return __shfl_up(v_local, lane_delta);
35#elif 1
36 static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
37
38 const uint32_t wrap_around_lane_delta = get_warp_size() - lane_delta;
39
40 const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
41 (__lane_id() << 2) + (wrap_around_lane_delta << 2), bit_cast<int32_t>(v_local));
42
43 return bit_cast<T>(v_remote_tmp);
44#endif
45}
46
47template <typename T>
48CK_TILE_DEVICE T warp_shuffle_down(const T& v_local, uint32_t lane_delta)
49{
50#if 0
51 return __shfl_down(v_local, lane_delta);
52#elif 1
53 static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
54
55 const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
56 (__lane_id() << 2) + (lane_delta << 2), bit_cast<int32_t>(v_local));
57
58 return bit_cast<T>(v_remote_tmp);
59#endif
60}
61
62template <typename T>
64{
65 static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
66
67 const int32x2_t x = __builtin_amdgcn_permlane32_swap(
68 bit_cast<int32_t>(v_local), bit_cast<int32_t>(v_local), false, false);
69
71 v(0) = bit_cast<T>(x[0]);
72 v(1) = bit_cast<T>(x[1]);
73
74 return v;
75}
76
77template <typename T>
78CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane)
79{
80#if 0
81 return __shfl(v_local, src_lane);
82#elif 1
83 if constexpr(sizeof(int32_t) > sizeof(T))
84 {
85 union packet
86 {
87 int32_t x;
88 T v;
89 };
90 packet p;
91 p.v = v_local;
92 packet p_remote;
93 p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
94
95 return p_remote.v;
96 }
97 else if constexpr(sizeof(int32_t) == sizeof(T))
98 {
99 const int32_t v_remote_tmp =
100 __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
101
102 return bit_cast<T>(v_remote_tmp);
103 }
104 else
105 {
106 static_assert(sizeof(T) % sizeof(int32_t) == 0, "wrong!");
107 constexpr index_t elm = sizeof(T) / sizeof(int32_t);
108 using vector_type = thread_buffer<int32_t, elm>;
109 auto vs = bit_cast<vector_type>(v_local);
110 auto vs_remote = vector_type{};
111 static_for<0, elm, 1>{}([&](auto i_e) {
112 int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
113 vs_remote(i_e) = tmp;
114 });
115 return bit_cast<T>(vs_remote);
116 }
117#endif
118}
119
120template <typename T>
121CK_TILE_DEVICE auto flag_to_exec(const T& v_flag)
122{
123 static_assert(sizeof(T) == 4);
124 // per-thread v_flag store into 2x sgpr
125 uint32x2_t exec_flag;
126 asm volatile("v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1"
127 : [s_exec_flag] "=s"(exec_flag)
128 : [v_flag] "v"(v_flag));
129 return exec_flag;
130}
131
132template <typename X, typename Y>
133CK_TILE_DEVICE auto cmp_lt_to_exec(const X& x, const Y& y)
134{
135 static_assert(sizeof(X) == 4 && sizeof(Y) == 4);
136 // per-thread cmp store into 2x sgpr
137 uint32x2_t exec_flag;
138 asm volatile("v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]"
139 : [s_exec_flag] "=s"(exec_flag)
140 : [v_x] "v"(x), [v_y] "v"(y));
141 return exec_flag;
142}
143
144} // namespace ck_tile
#define CK_TILE_DEVICE
Definition config.hpp:41
Definition tile/core/algorithm/cluster_descriptor.hpp:13
CK_TILE_DEVICE auto cmp_lt_to_exec(const X &x, const Y &y)
Definition utility.hpp:133
CK_TILE_DEVICE T warp_shuffle_up(const T &v_local, uint32_t lane_delta)
Definition utility.hpp:31
CK_TILE_HOST_DEVICE constexpr index_t get_warp_size()
Definition arch.hpp:63
CK_TILE_DEVICE T warp_shuffle(const T &v_local, uint32_t src_lane)
Definition utility.hpp:78
CK_TILE_HOST_DEVICE constexpr Y bit_cast(const X &x)
Definition bit_cast.hpp:11
CK_TILE_DEVICE T warp_shuffle_down(const T &v_local, uint32_t lane_delta)
Definition utility.hpp:48
CK_TILE_DEVICE auto warp_shuffle_down_pair(const T &v_local)
Definition utility.hpp:63
int32_t int32_t
Definition integer.hpp:10
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition utility.hpp:19
int32_t index_t
Definition integer.hpp:9
int32_t int32x2_t
Definition vector_type.hpp:154
CK_TILE_DEVICE auto flag_to_exec(const T &v_flag)
Definition utility.hpp:121
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition utility.hpp:25
uint32_t uint32x2_t
Definition vector_type.hpp:163
unsigned int uint32_t
Definition stdint.h:126
Definition tile/core/utility/functional.hpp:43
Definition tile/core/utility/debug.hpp:67