UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference#
Classes |
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference
The Universal GEMM kernel template. More...
#include <universal_gemm_kernel.hpp>
Classes | |
| struct | has_persistent_kernel |
| struct | has_tile_partitioner_output_offset_impl |
| struct | SplitKBatchOffset |
Public Types | |
| using | TilePartitioner = remove_cvref_t<TilePartitioner_> |
| using | GemmPipeline = remove_cvref_t<GemmPipeline_> |
| using | EpiloguePipeline = remove_cvref_t<EpiloguePipeline_> |
| using | AsLayout |
| using | BsLayout |
| using | DsLayout |
| using | AsDataType |
| using | BsDataType |
| using | DsDataType |
| using | CLayout = remove_cvref_t<typename GemmPipeline::CLayout> |
| using | EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType> |
| using | AElementWise = remove_cvref_t<typename GemmPipeline::AElementWise> |
| using | BElementWise = remove_cvref_t<typename GemmPipeline::BElementWise> |
| using | ADataType = remove_cvref_t<std::tuple_element_t<I0, AsDataType>> |
| using | BDataType = remove_cvref_t<std::tuple_element_t<I0, BsDataType>> |
| using | KernelArgs |
Public Member Functions | |
| template<bool U = !PersistentKernel, typename = std::enable_if_t<U>> | |
| CK_TILE_DEVICE void | operator() (KernelArgs kargs) const |
| template<bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void> | |
| CK_TILE_DEVICE void | operator() (KernelArgs kargs) const |
Static Public Member Functions | |
| static CK_TILE_HOST const std::string | GetName () |
| static CK_TILE_HOST constexpr auto | GridSize (index_t M, index_t N, index_t KBatch) |
| static CK_TILE_HOST auto | MaxOccupancyGridSize (const stream_config &s) -> dim3 |
| Get the maximum occupancy grid size for the persistent kernel on the current device. | |
| static CK_TILE_HOST auto | BlockSize () |
| static CK_TILE_HOST constexpr KernelArgs | MakeKernelArgs (const UniversalGemmHostArgs< NumATensor, NumBTensor, NumDTensor > &hostArgs) |
| static CK_TILE_HOST_DEVICE constexpr index_t | GetSmemSize () |
| static CK_TILE_HOST bool | IsSupportedArgument (const KernelArgs &kargs) |
| template<memory_operation_enum DstInMemOp = memory_operation_enum::set> | |
| static CK_TILE_DEVICE auto | MakeGemmTensorViews (const std::array< const ADataType *, NumATensor > &as_ptr, const std::array< const BDataType *, NumBTensor > &bs_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, const KernelArgs &kargs, const index_t k_size) |
| template<typename TensorView> | |
| static CK_TILE_DEVICE auto | MakeGemmPadViews (const TensorView &views) |
| template<typename PadView> | |
| static CK_TILE_DEVICE auto | MakeGemmTileWindows (const PadView &views, const index_t i_m, const index_t i_n) |
| template<bool UseDefaultScheduler = true> | |
| static CK_TILE_DEVICE void | RunGemm (const std::array< const ADataType *, NumATensor > &as_ptr, const std::array< const BDataType *, NumBTensor > &bs_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, void *smem_ptr_0, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
| Runs single GEMM problem cooperatively by whole workgroup. | |
| static CK_TILE_DEVICE void | RunGemm2LDS (const std::array< const ADataType *, NumATensor > &as_ptr, const std::array< const BDataType *, NumBTensor > &bs_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, void *__restrict__ smem_ptr_0, void *__restrict__ smem_ptr_1, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
| Runs single GEMM problem cooperatively by whole workgroup. | |
Static Public Attributes | |
| static constexpr bool | ADataTypeIsTuple |
| static constexpr bool | BDataTypeIsTuple |
| static constexpr bool | DDataTypeIsTuple |
| static constexpr bool | ALayoutIsTuple |
| static constexpr bool | BLayoutIsTuple |
| static constexpr bool | DLayoutIsTuple |
| static constexpr index_t | kBlockSize = GemmPipeline::BlockSize |
| static constexpr bool | PersistentKernel = has_persistent_kernel::value |
| static constexpr bool | has_tile_partitioner_output_offset |
| static constexpr auto | I0 = number<0>() |
| static constexpr auto | I1 = number<1>() |
| static constexpr auto | I2 = number<2>() |
| static constexpr auto | I3 = number<3>{} |
| static constexpr index_t | NumATensor = AsDataType::size() |
| static constexpr index_t | NumBTensor = BsDataType::size() |
| static constexpr index_t | NumDTensor = DsDataType::size() |
Detailed Description
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
struct ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >
struct ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >
The Universal GEMM kernel template.
Overview
This class provides the generic matrix multiplication kernel template. By semantically dividing the GEMM algorithm into the following parts, we achieve a flexible, versatile and robust kernel implementation.
- Prolog - The start of the GEMM kernel implementation, in the operator() function call operator, which determines the work scope of each workgroup.
- GemmPipeline - The core part, the "heart" of the matrix multiplication algorithm. This is the place where each workgroup loads data from global memory and carries out dot products.
- Epilogue - The "final" part of the matrix multiplication implementation, responsible for storing results to global memory. This is also the place where any additional operator fusion may take place.
Additionally, both GemmPipeline and EpiloguePipeline are parameterized with a so-called Policy, which determines all internal details of those functional parts. You can think of it as both the gemm and epilogue pipelines providing the control-flow logic, which is controlled by policies. Moreover, the policy is responsible for the definition of all necessary data layouts and each thread's work distribution.
- Template Parameters
  - TilePartitioner_ - The type of class providing the mapping of a workgroup index onto the output data tile to be calculated. It determines the workgroup-to-data relationship (in other words, which data is processed and calculated by which workgroup).
  - GemmPipeline_ - The type of class which provides the core part of the matrix multiplication. This class should provide the implementation of data loading from global memory and of the block-wise matrix multiplication. You can think of it as the work done from a single workgroup's point of view.
  - EpiloguePipeline_ - The type of class providing the final part of the matrix multiplication implementation. It is responsible for storing the results calculated by GemmPipeline to the output E tensor in global memory.
Member Typedef Documentation
◆ ADataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ADataType = remove_cvref_t<std::tuple_element_t<I0, AsDataType>> |
◆ AElementWise
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::AElementWise = remove_cvref_t<typename GemmPipeline::AElementWise> |
◆ AsDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::AsDataType |
Initial value:
std::conditional_t<ADataTypeIsTuple,
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition type_traits.hpp:21
static constexpr bool ADataTypeIsTuple
Definition universal_gemm_kernel.hpp:159
◆ AsLayout
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::AsLayout |
Initial value:
std::conditional_t<ALayoutIsTuple,
static constexpr bool ALayoutIsTuple
Definition universal_gemm_kernel.hpp:165
◆ BDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BDataType = remove_cvref_t<std::tuple_element_t<I0, BsDataType>> |
◆ BElementWise
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BElementWise = remove_cvref_t<typename GemmPipeline::BElementWise> |
◆ BsDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BsDataType |
Initial value:
std::conditional_t<BDataTypeIsTuple,
static constexpr bool BDataTypeIsTuple
Definition universal_gemm_kernel.hpp:161
◆ BsLayout
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BsLayout |
Initial value:
std::conditional_t<BLayoutIsTuple,
static constexpr bool BLayoutIsTuple
Definition universal_gemm_kernel.hpp:167
◆ CLayout
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::CLayout = remove_cvref_t<typename GemmPipeline::CLayout> |
◆ DsDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::DsDataType |
Initial value:
std::conditional_t<DDataTypeIsTuple,
static constexpr bool DDataTypeIsTuple
Definition universal_gemm_kernel.hpp:163
◆ DsLayout
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::DsLayout |
Initial value:
std::conditional_t<DLayoutIsTuple,
static constexpr bool DLayoutIsTuple
Definition universal_gemm_kernel.hpp:169
◆ EDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType> |
◆ EpiloguePipeline
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::EpiloguePipeline = remove_cvref_t<EpiloguePipeline_> |
◆ GemmPipeline
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GemmPipeline = remove_cvref_t<GemmPipeline_> |
◆ KernelArgs
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::KernelArgs |
Initial value:
UniversalGemmKernelArgs<AsLayout::size(), BsLayout::size(), DsLayout::size()>
The GEMM kernel device arguments.
Definition universal_gemm_kernel.hpp:86
◆ TilePartitioner
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::TilePartitioner = remove_cvref_t<TilePartitioner_> |
Member Function Documentation
◆ BlockSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestatic |
◆ GetName()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestaticnodiscard |
◆ GetSmemSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestaticconstexpr |
◆ GridSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestaticconstexpr |
◆ IsSupportedArgument()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestatic |
◆ MakeGemmPadViews()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<typename TensorView>
|
inlinestatic |
◆ MakeGemmTensorViews()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<memory_operation_enum DstInMemOp = memory_operation_enum::set>
|
inlinestatic |
◆ MakeGemmTileWindows()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<typename PadView>
|
inlinestatic |
◆ MakeKernelArgs()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestaticconstexpr |
◆ MaxOccupancyGridSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestatic |
Get the maximum occupancy grid size for the persistent kernel on the current device.
- Returns
- The maximum occupancy grid size.
- Note
- This function queries the maximum occupancy of the kernel using hipOccupancyMaxActiveBlocksPerMultiprocessor.
◆ operator()() [1/2]
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void>
|
inline |
◆ operator()() [2/2]
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<bool U = !PersistentKernel, typename = std::enable_if_t<U>>
|
inline |
◆ RunGemm()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<bool UseDefaultScheduler = true>
|
inlinestatic |
Runs single GEMM problem cooperatively by whole workgroup.
- Parameters
-
as_ptr: input As pointer
bs_ptr: input Bs pointer
ds_ptr: input Ds pointer
e_ptr: output E pointer
smem_ptr_0: The start memory pointer of the shared memory block.
kargs: GEMM kernel arguments
splitk_batch_offset: Utility structure used to calculate k batch.
block_idx_m: The GEMM's output M dimension tile index processed by this workgroup.
block_idx_n: The GEMM's output N dimension tile index processed by this workgroup.
◆ RunGemm2LDS()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
inlinestatic |
Runs single GEMM problem cooperatively by whole workgroup.
- Note
- RunGemm2LDS runs the GEMM with two shared memory buffers, using the ping-pong buffer mechanism.
- Parameters
-
as_ptr: input As pointer
bs_ptr: input Bs pointer
ds_ptr: input Ds pointer
e_ptr: output E pointer
smem_ptr_0: The starting pointer of 1st shared memory block.
smem_ptr_1: The starting pointer of 2nd shared memory block.
kargs: GEMM kernel arguments
splitk_batch_offset: Utility structure used to calculate k batch.
block_idx_m: The GEMM's output M dimension tile index processed by this workgroup.
block_idx_n: The GEMM's output N dimension tile index processed by this workgroup.
Member Data Documentation
◆ ADataTypeIsTuple
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
Initial value:
=
typename detail::detector< nonesuch, void, Op, Args... >::value_t is_detected
Definition type_traits.hpp:67
◆ ALayoutIsTuple
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
Initial value:
◆ BDataTypeIsTuple
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
Initial value:
◆ BLayoutIsTuple
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
Initial value:
◆ DDataTypeIsTuple
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ DLayoutIsTuple
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ has_tile_partitioner_output_offset
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
Initial value:
=
static constexpr bool value
Definition universal_gemm_kernel.hpp:226
◆ I0
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ I1
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ I2
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ I3
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ kBlockSize
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ NumATensor
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ NumBTensor
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ NumDTensor
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
◆ PersistentKernel
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
|
staticconstexpr |
The documentation for this struct was generated from the following file: