20 typename AScaleDataType,
22 typename BScaleDataType,
25 typename AMmaTileDesc,
26 typename BMmaTileDesc,
27 index_t ABlockTransferSrcScalarPerVector,
28 index_t BBlockTransferSrcScalarPerVector,
41template <
index_t ThreadBlockSize,
44 typename AScaleDataType,
46 typename BScaleDataType,
49 typename AMmaTileDesc,
50 typename BMmaTileDesc,
51 index_t ABlockTransferSrcScalarPerVector,
52 index_t BBlockTransferSrcScalarPerVector,
72 ABlockTransferSrcScalarPerVector,
73 BBlockTransferSrcScalarPerVector,
89 ABlockTransferSrcScalarPerVector,
90 BBlockTransferSrcScalarPerVector,
109 ABlockTransferSrcScalarPerVector,
110 BBlockTransferSrcScalarPerVector,
172 KPerBlock / ScaleBlockSize;
186 "A scale pack data type too large!");
188 "B scale pack data type too large!");
206 constexpr auto num_ds_read_inst_a =
213 constexpr auto num_buffer_load_stage1 =
216 constexpr auto num_buffer_load_stage2 = num_buffer_load_inst_a;
221 constexpr auto ds_read_a_issue_cycle =
223 constexpr auto ds_read_a_mfma_rate =
229 constexpr auto num_total_stages = std::max(2, MRepeat);
230 if constexpr(num_total_stages > 2)
235 constexpr auto num_mfma_perstage = num_mfma_inst / num_total_stages;
236 constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_total_stages;
238 constexpr auto num_ds_read_a_mfma_perstage =
241 constexpr auto num_ds_read_a_prefetch_stages = 2;
243 constexpr auto buffer_load_perstage_more =
245 constexpr auto buffer_load_perstage_less =
247 constexpr auto buffer_load_perstage_stage2 =
250 constexpr auto buffer_load_stages_more =
251 num_buffer_load_stage1 -
253 ((num_total_stages - 2));
255 constexpr auto buffer_load_issue_point_interval_more =
256 num_mfma_perstage / buffer_load_perstage_more;
257 constexpr auto buffer_load_issue_point_interval_less =
258 num_mfma_perstage / buffer_load_perstage_less;
259 constexpr auto buffer_load_issue_point_interval_stage2 =
260 num_mfma_perstage / buffer_load_perstage_stage2;
266 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
268 if constexpr(imfma % buffer_load_issue_point_interval_more == 0)
270 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
273 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
275 __builtin_amdgcn_sched_group_barrier(
276 0x100, ds_read_a_mfma_rate, 0);
282 static_for<0, (num_total_stages - 2 - buffer_load_stages_more), 1>{}([&](
auto ) {
284 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
285 if constexpr(imfma % buffer_load_issue_point_interval_less == 0)
287 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
289 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
291 __builtin_amdgcn_sched_group_barrier(
292 0x100, ds_read_a_mfma_rate, 0);
301 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
302 if constexpr(imfma % buffer_load_issue_point_interval_stage2 == 0)
304 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
306 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
308 __builtin_amdgcn_sched_group_barrier(
309 0x100, ds_read_a_mfma_rate, 0);
316 constexpr auto num_buffer_load_total = num_buffer_load_inst_a + num_buffer_load_inst_b +
320 num_ds_read_inst_a, ds_read_a_mfma_rate);
323 constexpr auto num_mfma_stage1 = num_mfma_inst - num_dsread_a_mfma;
325 constexpr auto mfma_perstage_more =
327 constexpr auto mfma_perstage_less =
330 constexpr auto mfma_stages_more =
331 num_mfma_stage1 - mfma_perstage_less * num_buffer_load_total;
334 if constexpr(i < mfma_stages_more)
337 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
339 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
344 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
346 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
351 if constexpr((i + num_buffer_load_inst_a) < mfma_stages_more)
354 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
356 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
361 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
363 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
368 if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b) <
372 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
374 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
379 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
381 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
386 if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b +
390 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
392 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
397 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
399 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
405 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
406 if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
409 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0);
413 __builtin_amdgcn_sched_group_barrier(
415 num_ds_read_inst_a - (num_dsread_a_mfma - 1) * ds_read_a_mfma_rate,
422 template <
bool HasMainLoop,
426 typename ABlockTransfer,
427 typename AGridBuffer,
428 typename ABlockBuffer,
429 typename ABlockTransferStep,
432 typename BBlockTransfer,
433 typename BGridBuffer,
434 typename BBlockBuffer,
435 typename BBlockTransferStep,
436 typename CThreadBuffer,
437 typename AScaleGridBuffer,
438 typename AScaleGridDesc,
439 typename AScaleThreadTransfer,
440 typename BScaleGridBuffer,
441 typename BScaleGridDesc,
442 typename BScaleThreadTransfer>
445 const AGridDesc& a_grid_desc,
446 const ABlockDesc& a_block_desc,
447 ABlockTransfer& a_blockwise_copy,
448 const AGridBuffer& a_grid_buf,
449 ABlockBuffer& a_block_bufs,
450 const ABlockTransferStep& a_block_copy_step,
452 const BGridDesc& b_grid_desc,
453 const BBlockDesc& b_block_desc,
454 BBlockTransfer& b_blockwise_copy,
455 const BGridBuffer& b_grid_buf,
456 BBlockBuffer& b_block_bufs,
457 const BBlockTransferStep& b_block_copy_step,
459 CThreadBuffer& c_thread_buf,
461 const AScaleGridDesc& a_scale_grid_desc,
462 AScaleThreadTransfer& a_scale_thread_copy,
463 const AScaleGridBuffer& a_scale_grid_buf,
464 const BScaleGridDesc& b_scale_grid_desc,
465 BScaleThreadTransfer& b_scale_thread_copy,
466 const BScaleGridBuffer& b_scale_grid_buf,
486 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_bufs(
I0));
487 b_blockwise_copy.Run(
488 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_thread_bufs(
I0));
490 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
491 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
496 a_scale_thread_copy.Run(a_scale_grid_desc,
500 a_scale_thread_bufs(
I0));
502 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
505 a_scale_thread_copy.MoveSrcSliceWindow(
510 a_scale_thread_copy.MoveSrcSliceWindow(
517 b_scale_thread_copy.Run(b_scale_grid_desc,
521 b_scale_thread_bufs(
I0));
523 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
526 b_scale_thread_copy.MoveSrcSliceWindow(
532 b_scale_thread_copy.MoveSrcSliceWindow(
545 constexpr auto a_k_step_chunk =
561 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_bufs(
I1));
562 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
565 c_thread_buf.Clear();
566 __builtin_amdgcn_sched_barrier(0);
569 if constexpr(HasMainLoop)
575 auto LoopFunc = [&](
auto scale_comp_buf,
auto scale_mem_buf) {
576 b_blockwise_copy.Run(b_grid_desc,
580 b_thread_bufs(scale_mem_buf));
585 a_scale_thread_copy.Run(a_scale_grid_desc,
589 a_scale_thread_bufs(scale_mem_buf));
591 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
594 a_scale_thread_copy.MoveSrcSliceWindow(
599 a_scale_thread_copy.MoveSrcSliceWindow(
606 b_scale_thread_copy.Run(b_scale_grid_desc,
610 b_scale_thread_bufs(scale_mem_buf));
612 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
615 b_scale_thread_copy.MoveSrcSliceWindow(
621 b_scale_thread_copy.MoveSrcSliceWindow(
626 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
629 constexpr auto im_major = m0 /
MXdlPack;
630 constexpr auto im_minor = m0 %
MXdlPack;
632 constexpr auto ik_major = k0 /
KXdlPack;
633 constexpr auto ik_minor = k0 %
KXdlPack;
635 constexpr auto in_major = n0 /
NXdlPack;
636 constexpr auto in_minor = n0 %
NXdlPack;
638 constexpr index_t a_scale_offset =
641 constexpr index_t b_scale_offset =
646 "Must have at least one scale per Xdlops "
656 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
662 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
671 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
674 b_thread_vec.template AsType<ComputeTypeB>()(ik) = b_thread_bufs
679 using mfma_input_type_a =
684 using mfma_input_type_b =
689 using mfma_scale_input_type_a =
692 using mfma_scale_input_type_b =
697 make_tuple(im_major, in_major, im_minor, in_minor, 0));
702 a_thread_vec.template AsType<mfma_input_type_a>(),
703 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
704 b_thread_vec.template AsType<mfma_input_type_b>(),
705 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
710 if constexpr(m0.value == SwitchM)
714 a_blockwise_copy.Run(a_grid_desc,
717 a_block_bufs(scale_comp_buf));
718 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
721 constexpr auto lds_buf =
722 m0.value >= SwitchM ? scale_mem_buf : scale_comp_buf;
729 1>{}([&](
auto chunk) {
730 constexpr auto a_k_step_chunk =
754 __builtin_amdgcn_sched_barrier(0);
761 }
while(i < (num_loop - 2));
767 b_blockwise_copy.Run(
768 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_thread_bufs(
I1));
773 a_scale_thread_copy.Run(a_scale_grid_desc,
777 a_scale_thread_bufs(
I1));
779 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
782 a_scale_thread_copy.MoveSrcSliceWindow(
789 b_scale_thread_copy.Run(b_scale_grid_desc,
793 b_scale_thread_bufs(
I1));
795 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
798 b_scale_thread_copy.MoveSrcSliceWindow(
803 constexpr auto im_major = m0 /
MXdlPack;
804 constexpr auto im_minor = m0 %
MXdlPack;
806 constexpr auto ik_major = k0 /
KXdlPack;
807 constexpr auto ik_minor = k0 %
KXdlPack;
809 constexpr auto in_major = n0 /
NXdlPack;
810 constexpr auto in_minor = n0 %
NXdlPack;
812 constexpr index_t a_scale_offset =
814 constexpr index_t b_scale_offset =
818 "Must have at least one scale per Xdlops "
826 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
831 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
839 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
842 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
847 using mfma_input_type_a =
851 using mfma_input_type_b =
855 using mfma_scale_input_type_a =
857 using mfma_scale_input_type_b =
861 make_tuple(im_major, in_major, im_minor, in_minor, 0));
866 a_thread_vec.template AsType<mfma_input_type_a>(),
867 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
868 b_thread_vec.template AsType<mfma_input_type_b>(),
869 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
873 if constexpr(m0.value == SwitchM)
879 constexpr auto lds_buf = m0.value >= SwitchM ?
I1 :
I0;
886 constexpr auto a_k_step_chunk =
907 constexpr auto im_major = m0 /
MXdlPack;
908 constexpr auto im_minor = m0 %
MXdlPack;
910 constexpr auto ik_major = k0 /
KXdlPack;
911 constexpr auto ik_minor = k0 %
KXdlPack;
913 constexpr auto in_major = n0 /
NXdlPack;
914 constexpr auto in_minor = n0 %
NXdlPack;
916 constexpr index_t a_scale_offset =
918 constexpr index_t b_scale_offset =
922 "Must have at least one scale per Xdlops "
930 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
935 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
943 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
946 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
951 using mfma_input_type_a =
955 using mfma_input_type_b =
959 using mfma_scale_input_type_a =
961 using mfma_scale_input_type_b =
965 make_tuple(im_major, in_major, im_minor, in_minor, 0));
970 a_thread_vec.template AsType<mfma_input_type_a>(),
971 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
972 b_thread_vec.template AsType<mfma_input_type_b>(),
973 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
984 constexpr auto a_k_step_chunk =
1011 constexpr auto im_major = m0 /
MXdlPack;
1012 constexpr auto im_minor = m0 %
MXdlPack;
1014 constexpr auto ik_major = k0 /
KXdlPack;
1015 constexpr auto ik_minor = k0 %
KXdlPack;
1017 constexpr auto in_major = n0 /
NXdlPack;
1018 constexpr auto in_minor = n0 %
NXdlPack;
1020 constexpr index_t a_scale_offset =
1022 constexpr index_t b_scale_offset =
1026 "Must have at least one scale per Xdlops "
1034 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
1039 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
1047 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
1050 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
1055 using mfma_input_type_a =
1059 using mfma_input_type_b =
1063 using mfma_scale_input_type_a =
1065 using mfma_scale_input_type_b =
1069 make_tuple(im_major, in_major, im_minor, in_minor, 0));
1074 a_thread_vec.template AsType<mfma_input_type_a>(),
1075 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
1076 b_thread_vec.template AsType<mfma_input_type_b>(),
1077 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
1088 constexpr auto a_k_step_chunk =
1135 Number<KRepeat / KXdlPack>{},
1142 Number<KRepeat / KXdlPack>{},
1148 using Base::b_thread_copy_;
1149 using Base::b_thread_desc_;
1150 using Base::c_thread_desc_;
__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
Definition utility/math.hpp:66
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetWaveIdx static __device__ auto GetWaveIdx()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:118
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AccType float AccType
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopInstList ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)> HotLoopInstList
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:88
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeA ADataType ComputeTypeA
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:34
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:344
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AMmaKStride static constexpr index_t AMmaKStride
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:68
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:49
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 __host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:220
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:269
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_n3_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_n3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:382
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:297
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Tuple5 decltype(CalculateAThreadOriginDataIndex()) Tuple5
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:184
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:41
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:42
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:79
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::CalculateCThreadOriginDataIndex static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:154
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MXdlPack static constexpr index_t MXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:84
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 __host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:283
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:361
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:60
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_m3_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_m3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:381
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KThreadChunk static constexpr index_t KThreadChunk
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:74
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KXdlPack static constexpr index_t KXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:86
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:234
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadBuffer __host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:116
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeB BDataType ComputeTypeB
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:35
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::APackedSize static constexpr index_t APackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:38
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockwiseGemmXdlops_mx_pipeline_base __host__ __device__ BlockwiseGemmXdlops_mx_pipeline_base(Tuple5 a_origin=CalculateAThreadOriginDataIndex(), Tuple5 b_origin=CalculateBThreadOriginDataIndex())
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:204
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BMmaKStride static constexpr index_t BMmaKStride
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:69
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NXdlPack static constexpr index_t NXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:85
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:327
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:55
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:51
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BPackedSize static constexpr index_t BPackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:39
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::CalculateAThreadOriginDataIndex static __device__ auto CalculateAThreadOriginDataIndex()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:130
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:1117
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::LocalPrefetchStages static constexpr index_t LocalPrefetchStages
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:160
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum static __host__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:197
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRun static constexpr auto ScalesPerXdlopsRun
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:175
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:42
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MXdlPack static constexpr index_t MXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:84
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:49
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:162
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_m3_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_m3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:381
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:41
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KThreadChunk static constexpr index_t KThreadChunk
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:74
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeA typename Base::ComputeTypeA ComputeTypeA
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:156
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_vec_size static constexpr auto b_scale_thread_vec_size
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:190
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:102
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AThreadCopy ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeTypeA, decltype(a_block_desc_m0_m1_m2_m3_k), decltype(a_thread_desc_), Sequence< 1, 1, 1, 1, KThreadChunk >, Sequence< 0, 1, 2, 3, 4 >, 4, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:1120
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Tuple5 typename Base::Tuple5 Tuple5
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:155
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop static __host__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:192
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:392
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::async_vmcnt static constexpr auto async_vmcnt
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:167
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::num_buffer_load_a_scale static constexpr auto num_buffer_load_a_scale
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:165
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerKBlockSize static constexpr auto ScalesPerKBlockSize
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:171
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:396
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_desc static constexpr auto a_scale_thread_desc
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:1133
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRunPerThread static constexpr auto ScalesPerXdlopsRunPerThread
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:179
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KXdlPack static constexpr index_t KXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:86
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ARegBuf static constexpr auto ARegBuf
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:1116
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:1129
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NXdlPack static constexpr index_t NXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:85
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AccType typename Base::AccType AccType
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:154
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_desc static constexpr auto b_scale_thread_desc
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:1140
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:161
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:79
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::async_vmcnt_encoding static constexpr auto async_vmcnt_encoding
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:169
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::num_buffer_load_b_scale static constexpr auto num_buffer_load_b_scale
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:166
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:60
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::APackedSize static constexpr index_t APackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:38
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotloopLocalBufSwitch static constexpr index_t HotloopLocalBufSwitch
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:163
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeB typename Base::ComputeTypeB ComputeTypeB
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:157
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_b static constexpr auto scale_pack_size_b
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:184
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:55
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::mx_scale_t e8m0_bexp_t mx_scale_t
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:182
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_a static constexpr auto scale_pack_size_a
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:183
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BPackedSize static constexpr index_t BPackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:39
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:159
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_bufs, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_bufs, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, const AScaleGridDesc &a_scale_grid_desc, AScaleThreadTransfer &a_scale_thread_copy, const AScaleGridBuffer &a_scale_grid_buf, const BScaleGridDesc &b_scale_grid_desc, BScaleThreadTransfer &b_scale_thread_copy, const BScaleGridBuffer &b_scale_grid_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:443
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_vec_size static constexpr auto a_scale_thread_vec_size
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:189
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:202
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:50
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp:38
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_LDS_Read_Inst_Num static constexpr index_t A_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:49
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_LDS_Read_Width static constexpr index_t A_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:82
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::C_MFMA_Inst_Num static constexpr index_t C_MFMA_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:54
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::C_MFMA_Inst_Cycle static constexpr index_t C_MFMA_Inst_Cycle
Definition blkgemmpipe_scheduler.hpp:105
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:41
Definition utility/sequence.hpp:43
Definition threadwise_tensor_slice_transfer.hpp:1260
Unsigned representation of a conventional biased Float32 exponent.
Definition utility/e8m0.hpp:26
Definition functional2.hpp:33
Definition dtype_vector.hpp:10