device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp Source File#
device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp
Go to the documentation of this file.
17#include "ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp"
float launch_and_time_kernel(const StreamConfig &stream_config, F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
Definition host_utility/kernel_launch.hpp:14
Definition utility/math.hpp:13
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
Definition utility/math.hpp:78
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
Definition convolution_backward_data_specialization.hpp:8
std::string getGemmSpecializationString(const GemmSpecialization &s)
Definition gemm_specialization.hpp:32
__host__ __device__ constexpr auto PadTensorDescriptor(const TensorDesc &desc, const TileLengths &tile_lengths, DoPads)
Definition matrix_padder.hpp:19
GemmSpecialization
Definition gemm_specialization.hpp:11
@ MKPadding
Definition gemm_specialization.hpp:18
@ KPadding
Definition gemm_specialization.hpp:16
@ MNKPadding
Definition gemm_specialization.hpp:20
@ NKPadding
Definition gemm_specialization.hpp:19
Definition convolution_backward_data_specialization.hpp:7
Definition ck.hpp:268
__global__ void kernel_welford_layernorm2d_second_half(const EMeanVarDataType *__restrict__ p_e_grid, const EMeanVarDataType *__restrict__ p_in_welford_mean_grid, const EMeanVarDataType *__restrict__ p_in_welford_var_grid, const int32_t *__restrict__ p_in_welford_count_grid, const GammaDataType *__restrict__ p_gamma_grid, const BetaDataType *__restrict__ p_beta_grid, HDataType *__restrict__ p_h_grid, const EHGridDesc_M_N e_grid_desc_m_n, const EHGridDesc_M_N h_grid_desc_m_n, const LayernormMeanVarGridDesc_M_NBlock mean_var_grid_desc_m_nblock, const LayernormCountGridDesc_M_NBlock count_grid_desc_m_nblock, const GammaBetaGridDesc_N gamma_grid_desc_n, const GammaBetaGridDesc_N beta_grid_desc_n, index_t numMeanVarCountBlockTileIteration_N, index_t NBlockClusterLength, ComputeDataType epsilon, HElementwiseOperation h_element_op)
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:87
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
__global__ void kernel_gemm_multiple_d_welford_first_half_wmma_cshuffle_v3(typename GridwiseGemm::Argument karg, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid)
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:33
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
Definition ck/stream_config.hpp:10
static __host__ constexpr bool CheckValidity(const Argument &karg)
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:624
static __host__ auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:273
static constexpr index_t KPack
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:154
static __host__ constexpr bool CalculateHasMainKBlockLoop(index_t K)
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:837
"Universal" GEMM kernel with SplitK support.
Definition gridwise_gemm_wmma_cshuffle_v3.hpp:233
Definition gridwise_welford_second_half_layernorm2d.hpp:42
Definition utility/sequence.hpp:43
__host__ static __device__ constexpr index_t At(index_t I)
Definition utility/sequence.hpp:53
Definition utility/tuple.hpp:117
Definition functional2.hpp:33
Definition device_base.hpp:197
BaseArgument()=default
BaseInvoker()=default
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:333
const GammaDataType * p_gamma_grid_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:415
EHGridDesc_M_N layernorm_e_grid_desc_m_n_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:420
std::array< index_t, NumDTensor > StrideDs_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:438
const BDataType * p_b_grid_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:409
LayernormMeanVarGridDesc_M_NBlock layernorm_mean_var_grid_desc_m_nblock_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:421
void * p_workspace_mean_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:412
index_t KRaw_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:435
index_t gemm_nblock_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:440
index_t MRaw_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:433
BElementwiseOperation b_element_op_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:429
EHGridDesc_M_N h_grid_desc_m_n_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:425
void * p_workspace_count_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:414
index_t StrideA_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:436
AElementwiseOperation a_element_op_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:428
const ADataType * p_a_grid_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:408
GammaBetaGridDesc_N gamma_grid_desc_n_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:423
void * p_workspace_var_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:413
LayernormCountGridDesc_M_NBlock layernorm_count_grid_desc_m_nblock_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:422
index_t StrideB_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:437
HDataType * p_h_grid_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:417
index_t StrideH_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:439
Argument(const void *p_a_grid, const void *p_b_grid, std::array< const void *, NumDTensor > p_ds_grid, const void *p_gamma_grid, const void *p_beta_grid, void *p_h_grid, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideH, double epsilon, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, HElementwiseOperation h_element_op)
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:334
void * p_workspace_e_grid_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:411
HElementwiseOperation h_element_op_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:431
index_t NRaw_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:434
AccDataType epsilon_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:441
const BetaDataType * p_beta_grid_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:416
std::array< const void *, NumDTensor > p_ds_grid_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:410
GammaBetaGridDesc_N beta_grid_desc_n_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:424
CDEElementwiseOperation cde_element_op_
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:430
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:446
float Run(const BaseArgument *p_arg, const StreamConfig &stream_config=StreamConfig{}) override
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:608
float Run(const Argument &arg, const StreamConfig &stream_config=StreamConfig{})
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:447
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:200
static bool IsSupportedArgument(const Argument &arg)
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:674
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeMeanVarDescriptor_M_N< Sequence< true, true >, LayernormBlockTileSize_M_N::At(0), LayernormBlockTileSize_M_N::At(1)>(1, 1)) LayernormMeanVarGridDesc_M_NBlock
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:295
static constexpr index_t LayernormGammaSrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:206
decltype(MakeEHGridDescriptor_M_N< Sequence< true, true >, 1, 1 >(1, 1, 1)) EHGridDesc_M_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:308
static constexpr index_t LayernormHDstVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:205
static constexpr index_t NumDTensor
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:204
Sequence< CDEShuffleBlockTransferScalarPerVector, CDEShuffleBlockTransferScalarPerVector, CDEShuffleBlockTransferScalarPerVector > CDEShuffleBlockTransferScalarPerVectors
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:220
static constexpr auto I2
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:217
ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3::MakeEHGridDescriptor_M_N
static auto MakeEHGridDescriptor_M_N(index_t M, index_t N, index_t Stride)
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:280
std::unique_ptr< BaseInvoker > MakeInvokerPointer() override
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:834
std::string GetTypeString() const override
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:840
static constexpr index_t LayernormESrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:208
static constexpr index_t LayernormBetaSrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:207
Sequence< LayernormThreadClusterSize_M_N::At(0) *LayernormThreadSliceSize_M, LayernormThreadClusterSize_M_N::At(1) *LayernormThreadSliceSize_N > LayernormBlockTileSize_M_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:211
static auto MakeDescriptor_X(index_t X)
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:289
static auto MakeArgument(const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, const void *p_gamma, const void *p_beta, void *p_h, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideH, double epsilon, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, HElementwiseOperation h_element_op)
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:752
static constexpr auto I0
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:215
GridwiseGemm_wmma_cshuffle_v3< ALayout, BLayout, DsLayout, HLayout, Tuple< ADataType >, Tuple< BDataType >, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVectors, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB > GridwiseGemmWelford
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:226
decltype(MakeDescriptor_X< LayernormBlockTileSize_M_N::At(1)>(1)) GammaBetaGridDesc_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:307
bool IsSupportedArgument(const BaseArgument *p_arg) override
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:747
static constexpr auto I3
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:218
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeCountDescriptor_M_N< Sequence< true, true >, LayernormBlockTileSize_M_N::At(0), LayernormBlockTileSize_M_N::At(1)>(1, 1)) LayernormCountGridDesc_M_NBlock
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:301
static constexpr auto I1
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:216
std::unique_ptr< BaseArgument > MakeArgumentPointer(const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, const void *p_gamma, const void *p_beta, void *p_h, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideH, double epsilon, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, HElementwiseOperation h_element_op) override
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:794
static auto MakeInvoker()
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:791
void SetWorkSpacePointer(BaseArgument *pArg, void *p_workspace, const StreamConfig &=StreamConfig{}) const override
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:638
size_t GetWorkSpaceSize(const BaseArgument *pArg) const override
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:615
static constexpr index_t LayernormThreadSliceSize_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:209
ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3::GridwiseWelfordLayernorm
GridwiseWelfordSecondHalfLayernorm2d< EMeanVarDataType, HDataType, GammaDataType, BetaDataType, AccDataType, EHGridDesc_M_N, LayernormMeanVarGridDesc_M_NBlock, LayernormCountGridDesc_M_NBlock, GammaBetaGridDesc_N, HElementwiseOperation, BlockSize, LayernormThreadClusterSize_M_N::At(I0), LayernormThreadClusterSize_M_N::At(I1), LayernormThreadSliceSize_M, LayernormThreadSliceSize_N, LayernormESrcVectorSize, LayernormHDstVectorSize, LayernormGammaSrcVectorSize, LayernormBetaSrcVectorSize > GridwiseWelfordLayernorm
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:310
DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3 DeviceOp
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:202
Definition device_gemm_multiple_d_layernorm.hpp:40