DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB > Struct Template Reference#
Classes |
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB > Struct Template Reference
#include <device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp>
Inheritance diagram for ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >:
Classes | |
| struct | Argument |
| struct | Invoker |
Public Types | |
| using | DeviceOp = DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3 |
| using | LayernormBlockTileSize_M_N |
| using | CDEShuffleBlockTransferScalarPerVectors |
| using | GridwiseGemmWelford |
| using | LayernormMeanVarGridDesc_M_NBlock |
| using | LayernormCountGridDesc_M_NBlock |
| using | GammaBetaGridDesc_N = decltype(MakeDescriptor_X<LayernormBlockTileSize_M_N::At(1)>(1)) |
| using | EHGridDesc_M_N = decltype(MakeEHGridDescriptor_M_N<Sequence<true, true>, 1, 1>(1, 1, 1)) |
| using | GridwiseWelfordLayernorm |
Public Member Functions | |
| size_t | GetWorkSpaceSize (const BaseArgument *pArg) const override |
| void | SetWorkSpacePointer (BaseArgument *pArg, void *p_workspace, const StreamConfig &=StreamConfig{}) const override |
| bool | IsSupportedArgument (const BaseArgument *p_arg) override |
| std::unique_ptr< BaseArgument > | MakeArgumentPointer (const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, const void *p_gamma, const void *p_beta, void *p_h, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideH, double epsilon, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, HElementwiseOperation h_element_op) override |
| std::unique_ptr< BaseInvoker > | MakeInvokerPointer () override |
| std::string | GetTypeString () const override |
| Public Member Functions inherited from ck::tensor_operation::device::BaseOperator | |
| BaseOperator ()=default | |
| BaseOperator (const BaseOperator &)=default | |
| BaseOperator & | operator= (const BaseOperator &)=default |
| virtual std::string | GetInstanceString () const |
| virtual std::string | GetTypeIdName () const |
| virtual std::optional< std::string > | GetObjectName () const |
| virtual std::optional< std::string > | GetTemplateInfo () const |
| virtual std::string | GetTypeIdHashCode () const |
| virtual | ~BaseOperator () |
Static Public Member Functions | |
| template<typename DoPads, index_t MPerTile, index_t NPerTile> | |
| static auto | MakeEHGridDescriptor_M_N (index_t M, index_t N, index_t Stride) |
| template<index_t XPerTile> | |
| static auto | MakeDescriptor_X (index_t X) |
| static bool | IsSupportedArgument (const Argument &arg) |
| static auto | MakeArgument (const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, const void *p_gamma, const void *p_beta, void *p_h, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideH, double epsilon, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, HElementwiseOperation h_element_op) |
| static auto | MakeInvoker () |
Static Public Attributes | |
| static constexpr index_t | NumDTensor = DsDataType::Size() |
| static constexpr index_t | LayernormHDstVectorSize = CDEShuffleBlockTransferScalarPerVector |
| static constexpr index_t | LayernormGammaSrcVectorSize = CDEShuffleBlockTransferScalarPerVector |
| static constexpr index_t | LayernormBetaSrcVectorSize = CDEShuffleBlockTransferScalarPerVector |
| static constexpr index_t | LayernormESrcVectorSize = CDEShuffleBlockTransferScalarPerVector |
| static constexpr index_t | LayernormThreadSliceSize_N = CDEShuffleBlockTransferScalarPerVector |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| Static Public Attributes inherited from ck::tensor_operation::device::DeviceGemmMultipleDLayernorm< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, GammaDataType, BetaDataType, HDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation > | |
| static constexpr index_t | NumDTensor = DsDataType::Size() |
Member Typedef Documentation
◆ CDEShuffleBlockTransferScalarPerVectors
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CDEShuffleBlockTransferScalarPerVectors |
Initial value:
Sequence<CDEShuffleBlockTransferScalarPerVector,
CDEShuffleBlockTransferScalarPerVector,
CDEShuffleBlockTransferScalarPerVector>
Definition utility/sequence.hpp:43
◆ DeviceOp
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::DeviceOp = DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3 |
◆ EHGridDesc_M_N
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::EHGridDesc_M_N = decltype(MakeEHGridDescriptor_M_N<Sequence<true, true>, 1, 1>(1, 1, 1)) |
◆ GammaBetaGridDesc_N
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GammaBetaGridDesc_N = decltype(MakeDescriptor_X<LayernormBlockTileSize_M_N::At(1)>(1)) |
◆ GridwiseGemmWelford
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GridwiseGemmWelford |
◆ GridwiseWelfordLayernorm
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GridwiseWelfordLayernorm |
Initial value:
GridwiseWelfordSecondHalfLayernorm2d<EMeanVarDataType,
HDataType,
GammaDataType,
BetaDataType,
AccDataType,
HElementwiseOperation,
BlockSize,
LayernormThreadClusterSize_M_N::At(I0),
LayernormThreadClusterSize_M_N::At(I1),
LayernormThreadSliceSize_M,
Definition gridwise_welford_second_half_layernorm2d.hpp:42
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeMeanVarDescriptor_M_N< Sequence< true, true >, LayernormBlockTileSize_M_N::At(0), LayernormBlockTileSize_M_N::At(1)>(1, 1)) LayernormMeanVarGridDesc_M_NBlock
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:295
static constexpr index_t LayernormGammaSrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:206
decltype(MakeEHGridDescriptor_M_N< Sequence< true, true >, 1, 1 >(1, 1, 1)) EHGridDesc_M_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:308
static constexpr index_t LayernormHDstVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:205
static constexpr index_t LayernormESrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:208
static constexpr index_t LayernormBetaSrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:207
static constexpr auto I0
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:215
decltype(MakeDescriptor_X< LayernormBlockTileSize_M_N::At(1)>(1)) GammaBetaGridDesc_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:307
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeCountDescriptor_M_N< Sequence< true, true >, LayernormBlockTileSize_M_N::At(0), LayernormBlockTileSize_M_N::At(1)>(1, 1)) LayernormCountGridDesc_M_NBlock
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:301
static constexpr auto I1
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:216
static constexpr index_t LayernormThreadSliceSize_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:209
◆ LayernormBlockTileSize_M_N
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormBlockTileSize_M_N |
Initial value:
Sequence<LayernormThreadClusterSize_M_N::At(0) * LayernormThreadSliceSize_M,
LayernormThreadClusterSize_M_N::At(1) * LayernormThreadSliceSize_N>
◆ LayernormCountGridDesc_M_NBlock
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormCountGridDesc_M_NBlock |
Initial value:
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeCountDescriptor_M_N<
LayernormBlockTileSize_M_N::At(1)>(1, 1))
__host__ static __device__ constexpr index_t At(index_t I)
Definition utility/sequence.hpp:53
◆ LayernormMeanVarGridDesc_M_NBlock
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
| using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormMeanVarGridDesc_M_NBlock |
Initial value:
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeMeanVarDescriptor_M_N<
LayernormBlockTileSize_M_N::At(1)>(1, 1))
Member Function Documentation
◆ GetTypeString()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ GetWorkSpaceSize()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [1/2]
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlinestatic |
◆ IsSupportedArgument() [2/2]
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ MakeArgument()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlinestatic |
◆ MakeArgumentPointer()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlineoverridevirtual |
◆ MakeDescriptor_X()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
template<index_t XPerTile>
|
inlinestatic |
◆ MakeEHGridDescriptor_M_N()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlinestatic |
◆ MakeInvoker()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlinestatic |
◆ MakeInvokerPointer()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlineoverridevirtual |
◆ SetWorkSpacePointer()
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
Member Data Documentation
◆ I0
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ I1
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ I2
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ I3
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ LayernormBetaSrcVectorSize
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ LayernormESrcVectorSize
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ LayernormGammaSrcVectorSize
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ LayernormHDstVectorSize
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ LayernormThreadSliceSize_N
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
◆ NumDTensor
template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
|
staticconstexpr |
The documentation for this struct was generated from the following file: