25template <
typename GridwiseGemm,
30 typename AElementwiseOperation,
31 typename BElementwiseOperation,
32 typename CDEElementwiseOperation,
33 typename AGridDesc_AK0_M_AK1,
34 typename BGridDesc_BK0_N_BK1,
35 typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
36 typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
37 typename Block2ETileMap,
38 bool HasMainKBlockLoop>
40#if CK_USE_LAUNCH_BOUNDS
44 const ADataType* __restrict__ p_a_grid,
45 const BDataType* __restrict__ p_b_grid,
47 EDataType* __restrict__ p_e_grid,
48 const AElementwiseOperation a_element_op,
49 const BElementwiseOperation b_element_op,
50 const CDEElementwiseOperation cde_element_op,
51 const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
52 const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
53 const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
54 ds_grid_desc_mblock_mperblock_nblock_nperblock,
55 const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
56 e_grid_desc_mblock_mperblock_nblock_nperblock,
57 const Block2ETileMap block_2_etile_map)
59#if(defined(__gfx90a__) || defined(__gfx94__))
60 if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
62 __shared__
char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
64 GridwiseGemm::template Run<HasMainKBlockLoop>(
73 a_grid_desc_ak0_m_ak1,
74 b_grid_desc_bk0_n_bk1,
75 ds_grid_desc_mblock_mperblock_nblock_nperblock,
76 e_grid_desc_mblock_mperblock_nblock_nperblock,
87 ignore = a_grid_desc_ak0_m_ak1;
88 ignore = b_grid_desc_bk0_n_bk1;
89 ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
90 ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
91 ignore = block_2_etile_map;
104template <
typename ALayout,
110 typename AComputeDataType_,
111 typename AccDataType,
112 typename CShuffleDataType,
115 typename AElementwiseOperation,
116 typename BElementwiseOperation,
117 typename CDEElementwiseOperation,
131 typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
132 typename ABlockTransferSrcAccessOrder,
133 index_t ABlockTransferSrcVectorDim,
134 index_t ABlockTransferScalarPerVector,
136 typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
137 typename BBlockTransferSrcAccessOrder,
138 index_t BBlockTransferSrcVectorDim,
139 index_t BBlockTransferScalarPerVector,
141 index_t CShuffleMXdlPerWavePerShuffle,
142 index_t CShuffleNXdlPerWavePerShuffle,
143 typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
144 index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock,
147 typename BComputeDataType_ = AComputeDataType_>
171#if CK_GFX90A_DENORM_WORKAROUND
213 __host__ __device__
static constexpr auto
216 constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
217 constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
219 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
226 return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
236 return static_cast<const DDataType*
>(
nullptr);
250 a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
253 b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
256 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
259 constexpr auto c_block_size =
260 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
263 NumGemmKPrefetchStage * a_block_space_size_aligned *
sizeof(
AComputeDataType) +
264 NumGemmKPrefetchStage * b_block_space_size_aligned *
sizeof(
BComputeDataType),
265 c_block_size *
sizeof(CShuffleDataType));
268 __host__ __device__
static auto
271 constexpr auto matrix_padder =
273 MPerBlock, NPerBlock, KPerBlock};
275 const auto a_grid_desc_mraw_kraw = [&]() {
291 __host__ __device__
static auto
294 constexpr auto matrix_padder =
296 MPerBlock, NPerBlock, KPerBlock};
298 const auto b_grid_desc_nraw_kraw = [&]() {
314 __host__ __device__
static auto
317 constexpr auto matrix_padder =
319 MPerBlock, NPerBlock, KPerBlock};
320 const auto e_grid_desc_mraw_nraw = [&]() {
336 __host__ __device__
static auto
338 const std::array<index_t, NumDTensor>& NRaws,
339 const std::array<index_t, NumDTensor>& DsStride)
352 __host__ __device__
static constexpr auto
355 const auto M = a_grid_desc_m_k.GetLength(
I0);
356 const auto K = a_grid_desc_m_k.GetLength(
I1);
358 const auto AK0 = K /
AK1;
368 __host__ __device__
static constexpr auto
371 const auto N = b_grid_desc_n_k.GetLength(
I0);
372 const auto K = b_grid_desc_n_k.GetLength(
I1);
374 const auto BK0 = K /
BK1;
384 __host__ __device__
static constexpr auto
387 const auto M = e_grid_desc_m_n.GetLength(
I0);
388 const auto N = e_grid_desc_m_n.GetLength(
I1);
390 const auto MBlock = M / MPerBlock;
391 const auto NBlock = N / NPerBlock;
400 return e_grid_desc_mblock_mperblock_nblock_nperblock;
404 __host__ __device__
static constexpr auto
414 __host__ __device__
static constexpr auto
442 static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
443 (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
444 "Invalid tuning param!");
446 static_assert(KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
447 "KPerBlock must be divisible by AK1Value and BK1Value!");
450 std::is_same_v<AElementwiseOperation,
452 std::is_same_v<BElementwiseOperation,
454 "Direct load transfers do not support elementwise operations other than passthrough.");
456 const auto M = a_grid_desc_m_k.GetLength(
I0);
457 const auto N = b_grid_desc_n_k.GetLength(
I0);
458 const auto AK = a_grid_desc_m_k.GetLength(
I1);
459 const auto BK = b_grid_desc_n_k.GetLength(
I1);
462 if(!(M == e_grid_desc_m_n.GetLength(
I0) && N == e_grid_desc_m_n.GetLength(
I1) && AK == BK))
470 valid = valid && (M == ds_grid_desc_m_n[i].GetLength(I0) &&
471 N == ds_grid_desc_m_n[i].GetLength(I1));
480 if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && AK % KPerBlock == 0))
486 const auto num_k_loop = AK / KPerBlock;
487 if(!GridwiseGemmPipe::IsSupported(num_k_loop))
493 if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n))
501 if(!(a_grid_desc_m_k.GetElementSpaceSize() *
sizeof(ADataType) <= TwoGB &&
502 b_grid_desc_n_k.GetElementSpaceSize() *
sizeof(BDataType) <= TwoGB &&
503 e_grid_desc_m_n.GetElementSpaceSize() *
sizeof(EDataType) <= TwoGB))
513 const index_t num_loop = K / KPerBlock;
514 return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
519 __device__ __host__
static constexpr auto GetMPerBlock() {
return MPerBlock; }
521 template <
bool HasMainKBlockLoop,
526 __device__
static void Run(
const ADataType* __restrict__ p_a_grid,
527 const BDataType* __restrict__ p_b_grid,
529 EDataType* __restrict__ p_e_grid,
530 void* __restrict__ p_shared,
531 const AElementwiseOperation& a_element_op,
532 const BElementwiseOperation& b_element_op,
533 const CDEElementwiseOperation& cde_element_op,
537 ds_grid_desc_mblock_mperblock_nblock_nperblock,
539 e_grid_desc_mblock_mperblock_nblock_nperblock,
548 p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
551 p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
557 ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize());
562 p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
565 const auto block_work_idx =
568 if(!block_2_etile_map.ValidCTileIndex(
570 make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I0),
571 e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I2))))
577 const index_t m_block_data_idx_on_grid =
578 __builtin_amdgcn_readfirstlane(block_work_idx[
I0] * MPerBlock);
579 const index_t n_block_data_idx_on_grid =
580 __builtin_amdgcn_readfirstlane(block_work_idx[
I1] * NPerBlock);
590 auto a_blockwise_copy =
593 ABlockTransferThreadClusterLengths_AK0_M_AK1,
594 ABlockTransferSrcAccessOrder,
597 decltype(a_grid_desc_ak0_m_ak1),
598 decltype(a_block_desc_ak0_m_ak1),
599 ABlockTransferSrcAccessOrder,
600 ABlockTransferSrcVectorDim,
602 ABlockTransferScalarPerVector>(
603 a_grid_desc_ak0_m_ak1,
605 a_block_desc_ak0_m_ak1,
608 auto b_blockwise_copy =
611 BBlockTransferThreadClusterLengths_BK0_N_BK1,
612 BBlockTransferSrcAccessOrder,
615 decltype(b_grid_desc_bk0_n_bk1),
616 decltype(b_block_desc_bk0_n_bk1),
617 BBlockTransferSrcAccessOrder,
618 BBlockTransferSrcVectorDim,
620 BBlockTransferScalarPerVector>(
621 b_grid_desc_bk0_n_bk1,
623 b_block_desc_bk0_n_bk1,
633 constexpr bool is_single_rate_mfma =
642 constexpr auto is_scale_mfma =
false;
650 is_scale_mfma>::selected_mfma.k_per_blk);
657 decltype(a_block_desc_ak0_m_ak1),
658 decltype(b_block_desc_bk0_n_bk1),
666 BComputeDataType_>();
668 auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
672 a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
674 const auto a_buffers_offset = 0;
675 auto a_block_buffers =
676 ck::lds_utils::AllocateLdsBuffers<AComputeDataType, NumGemmKPrefetchStage>(
678 a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
681 const auto b_buffers_offset = a_block_space_size_aligned * NumGemmKPrefetchStage;
682 auto b_block_buffers =
683 ck::lds_utils::AllocateLdsBuffers<BComputeDataType, NumGemmKPrefetchStage>(
685 b_block_desc_bk0_n_bk1.GetElementSpaceSize(),
692 const auto gridwise_gemm_pipeline =
695 const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
696 (a_grid_desc_ak0_m_ak1.GetLength(
I0) * a_grid_desc_ak0_m_ak1.GetLength(
I2)) /
700 a_block_desc_ak0_m_ak1,
704 a_block_slice_copy_step,
705 b_grid_desc_bk0_n_bk1,
706 b_block_desc_bk0_n_bk1,
710 b_block_slice_copy_step,
713 num_k_block_main_loop);
717 static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
718 NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
721 constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
722 constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
724 constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
725 blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
728 constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
729 blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
731 constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I0);
732 constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I1);
733 constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I2);
734 constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I3);
735 constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I4);
736 constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I5);
737 constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I6);
738 constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I7);
740 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
744 static_cast<CShuffleDataType*
>(p_shared),
745 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
748 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
767 const auto c_thread_mtx_on_block =
768 blockwise_gemm.CalculateCThreadOriginDataIndex(
I0,
I0,
I0,
I0);
770 const index_t m_thread_data_on_block = c_thread_mtx_on_block[
I0];
771 const index_t n_thread_data_on_block = c_thread_mtx_on_block[
I1];
773 const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
779 const auto m_thread_data_on_block_idx =
780 m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
783 const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
789 const auto n_thread_data_on_block_idx =
790 n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
794 auto c_thread_copy_vgpr_to_lds =
797 decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
798 decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
800 Sequence<CShuffleMXdlPerWavePerShuffle,
801 CShuffleNXdlPerWavePerShuffle,
814 c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
817 m_thread_data_on_block_idx[
I1],
818 n_thread_data_on_block_idx[
I1],
819 m_thread_data_on_block_idx[
I2],
820 m_thread_data_on_block_idx[
I3],
821 m_thread_data_on_block_idx[
I4],
822 n_thread_data_on_block_idx[
I2]),
827 tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
829 {
return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
834 tie(c_shuffle_block_buf),
836 {
return ds_grid_buf[i]; },
853 decltype(c_ds_desc_refs),
854 decltype(
tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
855 CDEElementwiseOperation,
858 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
860 CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>,
861 CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
865 CDEShuffleBlockTransferScalarPerVector_NPerBlock,
872 idx_c_ds_block_begin,
873 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
878 constexpr auto sfc_c_vgpr =
881 Sequence<CShuffleMXdlPerWavePerShuffle,
882 CShuffleNXdlPerWavePerShuffle,
891 constexpr auto sfc_cde_block =
895 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
897 CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
899 constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
901 static_assert(num_access == sfc_cde_block.GetNumOfAccess(),
"wrong!");
908 c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
909 sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
911 c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
912 c_shuffle_block_buf);
918 cde_block_copy_lds_and_global.Run(
921 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
924 if constexpr(access_id < num_access - 1)
926 constexpr auto cde_lds_and_global_step =
927 sfc_cde_block.GetForwardStep(access_id);
931 cde_block_copy_lds_and_global.MoveSrcSliceWindow(
932 c_ds_desc_refs, i +
I1, cde_lds_and_global_step);
936 cde_block_copy_lds_and_global.MoveDstSliceWindow(
937 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
939 cde_lds_and_global_step);
948 const void* p_b_grid,
949 std::array<const void*, NumDTensor> p_ds_grid,
956 std::array<index_t, NumDTensor> StrideDs,
958 AElementwiseOperation a_element_op,
959 BElementwiseOperation b_element_op,
960 CDEElementwiseOperation cde_element_op)
961 :
p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
962 p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
964 p_e_grid_{static_cast<EDataType*>(p_e_grid)},
983 p_ds_grid_(i) =
static_cast<const DDataType*
>(p_ds_grid[i]);
1006 [&](
auto i) { std::cout <<
"Ds[M, N]: " <<
ds_grid_desc_m_n_[i] << std::endl; });
#define CK_MIN_BLOCK_PER_CU
Definition ck.hpp:31
#define CK_MAX_THREAD_PER_BLOCK
Definition ck.hpp:30
#define IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType_)
Definition device_base.hpp:178
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
Definition utility/math.hpp:78
__host__ __device__ constexpr T max(T x)
Definition utility/math.hpp:84
__host__ __device__ constexpr auto lcm(X x, Y y)
Definition utility/math.hpp:198
GemmSpecialization
Definition gemm_specialization.hpp:11
ushort bhalf_t
Definition data_type.hpp:30
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
typename uniform_sequence_gen< NSize, I >::type uniform_sequence_gen_t
Definition utility/sequence.hpp:928
constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
Definition blockwise_gemm_xdlops.hpp:620
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
__host__ __device__ constexpr auto container_concat(const X &x, const Ys &... ys)
Definition utility/container_helper.hpp:320
constexpr auto GridwiseGemmPipeline_Selector()
Definition gridwise_gemm_pipeline_selector.hpp:31
int32_t index_t
Definition ck.hpp:299
typename conditional< predicate, X, Y >::type conditional_t
Definition utility/functional.hpp:115
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ __device__ constexpr auto make_freeze_transform(const LowerIndex &low_idx)
Definition multi_index_transform_helper.hpp:151
constexpr Tuple< Args &... > tie(Args &... args) noexcept
Definition utility/tuple.hpp:218
integral_constant< index_t, N > Number
Definition number.hpp:12
typename tuple_element< I, TTuple >::type tuple_element_t
Definition utility/tuple.hpp:208
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
__device__ index_t get_block_1d_id()
Definition get_id.hpp:47
constexpr bool is_same_v
Definition type.hpp:283
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
LoopScheduler
Definition loop_scheduler.hpp:15
typename sequence_merge< Sx, Sy >::type sequence_merge_t
Definition utility/sequence.hpp:925
int64_t long_index_t
Definition ck.hpp:300
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__device__ void block_sync_lds()
Definition synchronization.hpp:16
PipelineVersion
Definition gridwise_gemm_pipeline_selector.hpp:18
@ v4
Definition gridwise_gemm_pipeline_selector.hpp:22
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
__host__ __device__ constexpr auto make_dynamic_buffer(T *p, ElementSpaceSize element_space_size)
Definition dynamic_buffer.hpp:472
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
__host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple< X &... > &tx, const Tuple< Y &... > &ty)
Definition tuple_helper.hpp:42
__global__ void kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load(const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:43
Definition block_to_ctile_map.hpp:261
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1026
const BDataType * p_b_grid_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1012
BElementwiseOperation b_element_op_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1034
index_t NRaw_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1039
index_t MRaw_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1038
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1023
index_t KRaw_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1040
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1027
void Print() const
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1001
AGridDesc_M_K a_grid_desc_m_k_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1017
EGridDesc_M_N e_grid_desc_m_n_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1020
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1024
DsGridDesc_M_N ds_grid_desc_m_n_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1019
EDataType * p_e_grid_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1014
Block2ETileMap block_2_etile_map_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1030
BGridDesc_N_K b_grid_desc_n_k_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1018
const ADataType * p_a_grid_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1011
AElementwiseOperation a_element_op_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1033
CDEElementwiseOperation cde_element_op_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1035
DsGridPointer p_ds_grid_
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:1013
Argument(const void *p_a_grid, const void *p_b_grid, std::array< const void *, NumDTensor > p_ds_grid, void *p_e_grid, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:947
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:149
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1 __host__ static __device__ constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:197
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::AComputeDataType conditional_t< is_same_v< ComputeDataType, ck::tf32_t >, float, ComputeDataType > AComputeDataType
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:175
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I4 static constexpr auto I4
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:156
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeDsGridDescriptor_M_N __host__ static __device__ auto MakeDsGridDescriptor_M_N(const std::array< index_t, NumDTensor > &MRaws, const std::array< index_t, NumDTensor > &NRaws, const std::array< index_t, NumDTensor > &DsStride)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:337
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1 __host__ static __device__ constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:181
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::DsGridDesc_M_N remove_cvref_t< decltype(MakeDsGridDescriptor_M_N({}, {}, {}))> DsGridDesc_M_N
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:348
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::DsGridPointer decltype(MakeDsGridPointer()) DsGridPointer
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:517
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::BComputeDataType conditional_t< is_same_v< ComputeDataType, ck::tf32_t >, float, ComputeDataType > BComputeDataType
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:177
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::AGridDesc_M_K decltype(MakeAGridDescriptor_M_K(1, 1, 1)) AGridDesc_M_K
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:346
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeDefaultAGridDescriptor_AK0_M_AK1 __host__ static __device__ constexpr auto MakeDefaultAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K &a_grid_desc_m_k)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:353
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I2 static constexpr auto I2
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:154
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeDefaultBlock2ETileMap __host__ static __device__ constexpr auto MakeDefaultBlock2ETileMap(const EGridDesc_M_N &e_grid_desc_m_n)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:415
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::CalculateHasMainKBlockLoop __host__ static __device__ constexpr bool CalculateHasMainKBlockLoop(index_t K)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:511
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I0 static constexpr auto I0
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:152
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::BK1 static constexpr auto BK1
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:162
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::BGridDesc_BK0_N_BK1 remove_cvref_t< decltype(MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))> BGridDesc_BK0_N_BK1
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:423
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::BK0PerBlock static constexpr auto BK0PerBlock
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:164
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I6 static constexpr auto I6
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:158
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::ThisThreadBlock ThisThreadBlock< BlockSize > ThisThreadBlock
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:166
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock __host__ static __device__ constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:214
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I5 static constexpr auto I5
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:157
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::GridwiseGemmPipe remove_cvref_t< decltype(GridwiseGemmPipeline_Selector< PipelineVer, NumGemmKPrefetchStage, LoopSched >())> GridwiseGemmPipe
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:168
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::Run static __device__ void Run(const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer p_ds_grid, EDataType *__restrict__ p_e_grid, void *__restrict__ p_shared, const AElementwiseOperation &a_element_op, const BElementwiseOperation &b_element_op, const CDEElementwiseOperation &cde_element_op, const AGridDesc_AK0_M_AK1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 &b_grid_desc_bk0_n_bk1, const DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock &ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock &e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap &block_2_etile_map)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:526
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I3 static constexpr auto I3
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:155
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::BGridDesc_N_K decltype(MakeBGridDescriptor_N_K(1, 1, 1)) BGridDesc_N_K
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:347
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeDsGridPointer static constexpr auto MakeDsGridPointer()
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:230
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock __host__ static __device__ constexpr auto MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N &ds_grid_desc_m_n)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:405
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::NumDTensor static constexpr index_t NumDTensor
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:150
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::AK0PerBlock static constexpr auto AK0PerBlock
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:163
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeDefaultBGridDescriptor_BK0_N_BK1 __host__ static __device__ constexpr auto MakeDefaultBGridDescriptor_BK0_N_BK1(const BGridDesc_N_K &b_grid_desc_n_k)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:369
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::CheckValidity __host__ static __device__ constexpr bool CheckValidity(const AGridDesc_M_K &a_grid_desc_m_k, const BGridDesc_N_K &b_grid_desc_n_k, const DsGridDesc_M_N &ds_grid_desc_m_n, const EGridDesc_M_N &e_grid_desc_m_n, const Block2ETileMap &block_2_etile_map)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:436
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::AGridDesc_AK0_M_AK1 remove_cvref_t< decltype(MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))> AGridDesc_AK0_M_AK1
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:421
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I1 static constexpr auto I1
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:153
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetSharedMemoryNumberOfByte __host__ static __device__ constexpr index_t GetSharedMemoryNumberOfByte()
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:241
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeAGridDescriptor_M_K __host__ static __device__ auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:269
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeBGridDescriptor_N_K __host__ static __device__ auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:292
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeEGridDescriptor_M_N __host__ static __device__ auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:315
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock remove_cvref_t< decltype(MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))> EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:428
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock __host__ static __device__ constexpr auto MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N &e_grid_desc_m_n)
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:385
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::I7 static constexpr auto I7
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:159
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetMPerBlock __device__ static __host__ constexpr auto GetMPerBlock()
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:519
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::AK1 static constexpr auto AK1
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:161
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::Block2ETileMap remove_cvref_t< decltype(MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))> Block2ETileMap
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:432
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::EGridDesc_M_N decltype(MakeEGridDescriptor_M_N(1, 1, 1)) EGridDesc_M_N
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:349
ck::GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer >< math::max(NXdlPerWave64, 1)>::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock remove_cvref_t< decltype(MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))> DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:425
Selects the appropriate MFMA instruction type and configuration for given data types and tile sizes o...
Definition xdlops_gemm.hpp:1208
Definition utility/sequence.hpp:43
Definition tensor_space_filling_curve.hpp:20
Definition thread_group_tensor_slice_transfer_direct_load.hpp:55
Definition thread_group_tensor_slice_transfer_v7.hpp:42
Definition threadwise_tensor_slice_transfer.hpp:39
Definition utility/tuple.hpp:117
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition functional2.hpp:33
Definition device_base.hpp:197
__host__ __device__ constexpr auto PadBDescriptor_N_K(const BDesc_NRaw_KRaw &b_desc_nraw_kraw) const
Definition matrix_padder.hpp:155
__host__ __device__ constexpr auto PadCDescriptor_M_N(const CDesc_MRaw_NRaw &c_desc_mraw_nraw) const
Definition matrix_padder.hpp:163
__host__ __device__ constexpr auto PadADescriptor_M_K(const ADesc_MRaw_KRaw &a_desc_mraw_kraw) const
Definition matrix_padder.hpp:147
Definition matrix_padder.hpp:180
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340