GridwiseGemmMultiD_xdl_cshuffle_v3< ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraMCustom, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraNCustom, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVectors, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, LDSTypeA, LDSTypeB, DoElementwiseBeforeCShuffle, DirectLoad > Struct Template Reference
Classes |
Public Types |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::GridwiseGemmMultiD_xdl_cshuffle_v3< ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraMCustom, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraNCustom, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVectors, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, LDSTypeA, LDSTypeB, DoElementwiseBeforeCShuffle, DirectLoad > Struct Template Reference
#include <gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp>
Classes | |
| struct | Problem |
| struct | Argument |
| struct | SplitKBatchOffset |
Public Types | |
| using | DsGridPointer = decltype(MakeDsGridPointer()) |
| using | ThisThreadBlock = ThisThreadBlock<BlockSize> |
| using | BlockwiseGemmPipe |
| using | Block2CTileMapDefault = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock> |
Static Public Member Functions | |
| static constexpr auto | MakeDsGridPointer () |
| static __host__ auto | CalculateGridSize (index_t M, index_t N, index_t KBatch) |
| __host__ static __device__ auto | CalculateMPadded (index_t M) |
| __host__ static __device__ auto | CalculateNPadded (index_t N) |
| __host__ static __device__ auto | CalculateKPadded (index_t K) |
| __host__ static __device__ auto | CalculateAK0Padded (index_t K, index_t K_Batch=1) |
| __host__ static __device__ auto | CalculateBK0Padded (index_t K, index_t K_Batch=1) |
| __host__ static __device__ auto | CalculateKPadded (index_t K, index_t K_Batch=1) |
| __host__ static __device__ auto | CalculateKRead (index_t K, index_t K_Batch=1) |
| __host__ static __device__ auto | CalculateMBlock (index_t M) |
| __host__ static __device__ auto | CalculateNBlock (index_t N) |
| template<typename GridDesc_K0_MN_K1_T, index_t K0Number, index_t K1Value> | |
| __host__ static __device__ auto | TransformGrid (GridDesc_K0_MN_K1_T &desc) |
| template<index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1> | |
| __host__ static __device__ constexpr auto | MakeGemmMmaTileDescriptor (const TileDesc_K0_MN_K1 &) |
| __host__ static __device__ auto | MakeAGridDescriptor_AK0_M_AK1 (index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0) |
| __host__ static __device__ auto | MakeBGridDescriptor_BK0_N_BK1 (index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0) |
| template<typename ABlockDesc_AK0_M_AK1> | |
| __host__ static __device__ constexpr auto | MakeAMmaTileDescriptor_M0_M1_M2_K (const ABlockDesc_AK0_M_AK1 &) |
| template<typename BBlockDesc_BK0_N_BK1> | |
| __host__ static __device__ constexpr auto | MakeBMmaTileDescriptor_N0_N1_N2_K (const BBlockDesc_BK0_N_BK1 &) |
| template<typename ELayout> | |
| __host__ static __device__ auto | MakeCGridDescriptor_M_N (index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC) |
| __host__ static __device__ auto | MakeDsGridDescriptor_M_N (index_t M, index_t MPad, index_t N, index_t NPad, std::array< index_t, NumDTensor > StrideDs) |
| template<typename DsGridDesc> | |
| static __device__ constexpr auto | MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock (const DsGridDesc &ds_grid_desc_m_n, index_t MBlock, index_t NBlock) |
| static __device__ constexpr auto | GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1 () |
| static __device__ constexpr auto | GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1 () |
| static __device__ constexpr auto | GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock () |
| static __device__ constexpr index_t | GetSharedMemoryNumberOfByte () |
| template<InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set> | |
| static __device__ bool constexpr | IsValidCompilationParameter () |
| static __host__ constexpr bool | CheckValidity (const Argument &karg) |
| __host__ static __device__ constexpr bool | CalculateHasMainKBlockLoop (index_t K) |
| __host__ static __device__ constexpr TailNumber | CalculateKBlockLoopTailNum (index_t K) |
| template<typename CGridDesc> | |
| static __device__ constexpr auto | MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock (const CGridDesc &c_grid_desc_m_n, index_t MBlock, index_t NBlock) |
| template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd> | |
| static __device__ void | Run (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer &p_ds_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared, const Problem &problem, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op) |
| template<typename Block2CTileMap, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd> | |
| static __device__ void | Run (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer &p_ds_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared, const Problem &problem, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op, const Block2CTileMap &block_2_ctile_map) |
| template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum, typename Block2CTileMap, typename AGridDesc_AK0_M_K1, typename BGridDesc_BK0_N_K1, typename DsGridDesc_M_N, typename CGridDesc_M_N> | |
| static __device__ void | Run (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer &p_ds_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared, const Problem &problem, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op, const Block2CTileMap &block_2_ctile_map, const AGridDesc_AK0_M_K1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_K1 &b_grid_desc_bk0_n_bk1, const DsGridDesc_M_N &ds_grid_desc_m_n, const CGridDesc_M_N &c_grid_desc_m_n) |
| template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd> | |
| static __device__ void | Run_2Lds (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer &p_ds_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared_0, void *__restrict__ p_shared_1, const Problem &problem, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op) |
| template<typename Block2CTileMap, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd> | |
| static __device__ void | Run_2Lds (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer &p_ds_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared_0, void *__restrict__ p_shared_1, const Problem &problem, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op, const Block2CTileMap &block_2_ctile_map) |
| template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum, typename Block2CTileMap, typename AGridDesc_AK0_M_K1, typename BGridDesc_BK0_N_K1, typename DsGridDesc_M_N, typename CGridDesc_M_N> | |
| static __device__ void | Run_2Lds (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer &p_ds_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared_0, void *__restrict__ p_shared_1, const Problem &problem, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op, const Block2CTileMap &block_2_ctile_map, const AGridDesc_AK0_M_K1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_K1 &b_grid_desc_bk0_n_bk1, const DsGridDesc_M_N &ds_grid_desc_m_n, const CGridDesc_M_N &c_grid_desc_m_n) |
Static Public Attributes | |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | I4 = Number<4>{} |
| static constexpr auto | I5 = Number<5>{} |
| static constexpr auto | I6 = Number<6>{} |
| static constexpr auto | I7 = Number<7>{} |
| static constexpr auto | CShuffleBlockTransferScalarPerVector_NPerBlock |
| static constexpr auto | AK0Number = Number<KPerBlock / AK1Value>{} |
| static constexpr auto | BK0Number = Number<KPerBlock / BK1Value>{} |
| static constexpr auto | AK1Number = Number<AK1Value>{} |
| static constexpr auto | BK1Number = Number<BK1Value>{} |
| static constexpr bool | DirectLoadEnabled = DirectLoad |
| static constexpr index_t | NumDTensor = DsDataType::Size() |
| static constexpr auto | lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number) |
| static constexpr bool | is_single_rate_mfma |
| static constexpr auto | is_scale_mfma = false |
| static constexpr index_t | KPack |
Member Typedef Documentation
◆ Block2CTileMapDefault
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
| using ck::GridwiseGemmMultiD_xdl_cshuffle_v3< ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraMCustom, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraNCustom, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVectors, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, LDSTypeA, LDSTypeB, DoElementwiseBeforeCShuffle, DirectLoad >::Block2CTileMapDefault = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock> |
◆ BlockwiseGemmPipe
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
| using ck::GridwiseGemmMultiD_xdl_cshuffle_v3< ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraMCustom, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraNCustom, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVectors, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, LDSTypeA, LDSTypeB, DoElementwiseBeforeCShuffle, DirectLoad >::BlockwiseGemmPipe |
Initial value:
remove_cvref_t<
decltype(BlockGemmPipeline_Selector<
BlkGemmPipelineVer,
BlkGemmPipeSched,
BlockSize,
LDSTypeA,
LDSTypeB,
ComputeTypeA,
AccDataType,
decltype(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()),
decltype(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()),
decltype(MakeAMmaTileDescriptor_M0_M1_M2_K(
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())),
decltype(MakeBMmaTileDescriptor_N0_N1_N2_K(
GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())),
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXdl,
NPerXdl,
MXdlPerWave,
NXdlPerWave,
KPack,
DirectLoad>())>
constexpr auto BlockGemmPipeline_Selector()
Definition blockwise_gemm_pipeline_wmma_selector.hpp:32
static constexpr auto KPack
Definition gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp:113
typename std::conditional<(NumBTensor > 1), ComputeTypeB, remove_cvref_t< tuple_element_t< 0, BsDataType > > >::type LDSTypeB
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:140
typename std::conditional<(NumATensor > 1), ComputeTypeA, remove_cvref_t< tuple_element_t< 0, AsDataType > > >::type LDSTypeA
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:136
static __device__ constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
Definition gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp:768
__host__ static __device__ constexpr auto MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1 &)
Definition gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp:519
__host__ static __device__ constexpr auto MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1 &)
Definition gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp:528
static __device__ constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
Definition gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp:923
◆ DsGridPointer
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
| using ck::GridwiseGemmMultiD_xdl_cshuffle_v3< ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraMCustom, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraNCustom, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVectors, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, LDSTypeA, LDSTypeB, DoElementwiseBeforeCShuffle, DirectLoad >::DsGridPointer = decltype(MakeDsGridPointer()) |
◆ ThisThreadBlock
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
| using ck::GridwiseGemmMultiD_xdl_cshuffle_v3< ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraMCustom, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraNCustom, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVectors, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, LDSTypeA, LDSTypeB, DoElementwiseBeforeCShuffle, DirectLoad >::ThisThreadBlock = ThisThreadBlock<BlockSize> |
Member Function Documentation
◆ CalculateAK0Padded()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
| __host__ static __device__ auto CalculateAK0Padded (index_t K, index_t K_Batch=1) |
inline static |
◆ CalculateBK0Padded()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
| __host__ static __device__ auto CalculateBK0Padded (index_t K, index_t K_Batch=1) |
inline static |
◆ CalculateGridSize()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
| static __host__ auto CalculateGridSize (index_t M, index_t N, index_t KBatch) |
inline static |
◆ CalculateHasMainKBlockLoop()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ CalculateKBlockLoopTailNum()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ CalculateKPadded() [1/2]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ CalculateKPadded() [2/2]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ CalculateKRead()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ CalculateMBlock()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ CalculateMPadded()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ CalculateNBlock()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ CalculateNPadded()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ CheckValidity()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ GetSharedMemoryNumberOfByte()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ IsValidCompilationParameter()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
|
inline static constexpr |
◆ MakeAGridDescriptor_AK0_M_AK1()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ MakeAMmaTileDescriptor_M0_M1_M2_K()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<typename ABlockDesc_AK0_M_AK1>
|
inline static constexpr |
◆ MakeBGridDescriptor_BK0_N_BK1()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ MakeBMmaTileDescriptor_N0_N1_N2_K()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<typename BBlockDesc_BK0_N_BK1>
|
inline static constexpr |
◆ MakeCGridDescriptor_M_N()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<typename ELayout>
|
inline static |
◆ MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<typename CGridDesc>
|
inline static constexpr |
◆ MakeDsGridDescriptor_M_N()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
◆ MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<typename DsGridDesc>
|
inline static constexpr |
◆ MakeDsGridPointer()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ MakeGemmMmaTileDescriptor()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static constexpr |
◆ Run() [1/3]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd>
|
inline static |
◆ Run() [2/3]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<typename Block2CTileMap, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd>
|
inline static |
◆ Run() [3/3]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum, typename Block2CTileMap, typename AGridDesc_AK0_M_K1, typename BGridDesc_BK0_N_K1, typename DsGridDesc_M_N, typename CGridDesc_M_N>
|
inline static |
◆ Run_2Lds() [1/3]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd>
|
inline static |
◆ Run_2Lds() [2/3]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<typename Block2CTileMap, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum = TailNumber::Odd>
|
inline static |
◆ Run_2Lds() [3/3]
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
template<bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, TailNumber TailNum, typename Block2CTileMap, typename AGridDesc_AK0_M_K1, typename BGridDesc_BK0_N_K1, typename DsGridDesc_M_N, typename CGridDesc_M_N>
|
inline static |
◆ TransformGrid()
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
inline static |
Member Data Documentation
◆ AK0Number
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ AK1Number
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ BK0Number
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ BK1Number
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ CShuffleBlockTransferScalarPerVector_NPerBlock
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
Initial value:
=
CDEShuffleBlockTransferScalarPerVectors{}[I0]
static constexpr auto I0
Definition gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp:162
◆ DirectLoadEnabled
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I0
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I1
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I2
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I3
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I4
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I5
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I6
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ I7
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ is_scale_mfma
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ is_single_rate_mfma
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
Initial value:
=
lcm_AK1_BK1 <= 4) ||
(is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8) ||
KPerBlock < 128 && MPerXdl == 16))
? true
: false
static constexpr auto lcm_AK1_BK1
Definition gridwise_gemm_xdl_cshuffle_conv_v3.hpp:82
◆ KPack
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
Initial value:
=
math::max(lcm_AK1_BK1,
MfmaSelector<ComputeTypeA,
MPerXdl,
NPerXdl,
ComputeTypeB,
is_single_rate_mfma,
is_scale_mfma>::selected_mfma.k_per_blk)
static constexpr bool is_single_rate_mfma
Definition gridwise_gemm_xdl_cshuffle_conv_v3.hpp:83
static constexpr auto is_scale_mfma
Definition gridwise_gemm_xdl_cshuffle_conv_v3.hpp:91
Selects the appropriate MFMA instruction type and configuration for given data types and tile sizes o...
Definition xdlops_gemm.hpp:1208
◆ lcm_AK1_BK1
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
◆ NumDTensor
template<typename ALayout, typename BLayout, typename DsLayout, typename CLayout, typename ADataType, typename BDataType, typename AccDataType, typename CShuffleDataType, typename DsDataType, typename CDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, tensor_operation::device::GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraMCustom, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraNCustom, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEShuffleBlockTransferScalarPerVectors, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4, typename ComputeTypeA = CDataType, typename ComputeTypeB = ComputeTypeA, typename LDSTypeA = ADataType, typename LDSTypeB = BDataType, bool DoElementwiseBeforeCShuffle = false, bool DirectLoad = false>
|
static constexpr |
The documentation for this struct was generated from the following file: