/*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file \brief Template for a Block-Ell sparse gemm kernel. */ #pragma once #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" #include "cutlass/arch/arch.h" #include "cutlass/device_kernel.h" #include "cutlass/gemm/threadblock/threadblock_swizzle.h" #include "cutlass/gemm/kernel/ell_gemm.h" #include "cutlass/gemm/kernel/default_ell_gemm.h" #include "cutlass/gemm/device/default_gemm_configuration.h" //////////////////////////////////////////////////////////////////////////////// namespace cutlass { namespace gemm { namespace device { ///////////////////////////////////////////////////////////////////////////////////////////////// /*! Blocked-Ell sparse gemm device-level operator. This is an interface to efficient CUTLASS Blocked-Ell kernels that may be invoked from host code. The contributions of this class are: 1. At compile time, it maps data types and high-level structural parameters onto specific CUTLASS components. 2. At runtime, it maps logical arguments to Blocked-Ell problems to kernel parameters. 3. At runtime, it launches kernels on the device. Example of a CUTLASS EllGemm operator is as follows: // // Instantiate the CUTLASS EllGemm operator. // cutlass::gemm::device::EllGemm< cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, cutlass::layout::ColumnMajor, float, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 128, 32>, cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 128 / cutlass::sizeof_bits::value, float, float>, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, 4, // Stages 128 / cutlass::sizeof_bits::value, // Alignment A 128 / cutlass::sizeof_bits::value // Alignment B > ellgemm_op; // // Launch the EllGemm operation on the device // Description of parameters and tensors used to represent the Blocked-Ellpack (ELL) format: a_rows - Rows in the sparse matrix. a_cols - Colums in the sparse matrix. BlockedEllA - Packed matrix (ellValue matrix) that stores non-zero values in consecutive blocks, whose size is (a_rows * a_ell_num_columns) ell_idx - Blocked-ELL Column indices (ellColInd) matrix, whose size is (a_rows / a_ell_blocksize) * (a_ell_num_columns / a_ell_blocksize) a_ell_blocksize - Size of the ELL-Blocks. a_ell_num_columns - Number of columns in the Blocked-Ellpack format (ellValue columns) B - Input dense matrix whose size is (a_cols * n) C/D - Output dense matrix whose size is (a_rows * n) cutlass::Status status = ellgemm_op({ {a_rows, n, a_cols}, // GemmCoord problem_size {BlockedEllA, lda}, // TensorRef ref_BlockedEllA {B, ldb}, // TensorRef ref_B, {C, ldc}, // TensorRef ref_C, {D, ldd}, // TensorRef ref_D, ell_idx, // Blocked-ELL Column indices or ellColInd matrix (const int*) a_ell_num_columns, // Columns in the Blocked-Ellpack (ellValue) matrix (int) a_ell_blocksize, // Size of the ELL-Blocks (int) a_ell_base, // Base index of ellColInd (int) - Zero or One {alpha, beta} // EpilogueOutputOp::Params epilogue_op_params }); A simplified view of the template is listed below. template < /// Element type for A matrix operand typename ElementA, /// Layout type for A matrix operand typename LayoutA, /// Element type for B matrix operand typename ElementB, /// Layout type for B matrix operand typename LayoutB, /// Element type for C and D matrix operands typename ElementC, /// Layout type for C and D matrix operands typename LayoutC, /// Element type for internal accumulation typename ElementAccumulator, /// Operator class tag typename OperatorClass, /// Tag indicating architecture to tune for. This is the minimum SM that /// supports the intended feature. The device kernel can be built /// targeting any SM larger than this number. typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape, /// Warp-level tile size (concept: GemmShape) typename WarpShape, /// Warp-level tile size (concept: GemmShape) typename InstructionShape, /// Epilogue output operator typename EpilogueOutputOp, /// Threadblock-level swizzling operator typename ThreadblockSwizzle, /// Number of stages used in the pipelined mainloop int Stages /// Access granularity of A matrix in units of elements int AlignmentA, /// Access granularity of B matrix in units of elements int AlignmentB, /// Supports split-K with serial reduction bool SplitKSerial, /// Operation performed by GEMM typename Operator, /// Sparse matrix is A or not bool IsASparse > class EllGemm; */ template < /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand typename LayoutA_, /// Element type for B matrix operand typename ElementB_, /// Layout type for B matrix operand typename LayoutB_, /// Element type for C and D matrix operands typename ElementC_, /// Layout type for C and D matrix operands typename LayoutC_, /// Element type for internal accumulation typename ElementAccumulator_ = ElementC_, /// Operator class tag typename OperatorClass_ = arch::OpClassTensorOp, /// Tag indicating architecture to tune for typename ArchTag_ = arch::Sm80, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, /// Warp-level tile size (concept: GemmShape) typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, /// Instruction-level tile size (concept: GemmShape) typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, /// Epilogue output operator typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, /// Threadblock-level swizzling operator typename ThreadblockSwizzle_ = typename threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration::kStages, /// Access granularity of A matrix in units of elements int AlignmentA = DefaultGemmConfiguration::kAlignmentA, /// Access granularity of B matrix in units of elements int AlignmentB = DefaultGemmConfiguration::kAlignmentB, /// If true, kernel supports split-K with serial reduction bool SplitKSerial = false, /// Operation performed by GEMM typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator, /// Sparse matrix is A or not bool IsASparse = true > class EllGemm { public: using ElementA = ElementA_; using LayoutA = LayoutA_; using TensorRefA = TensorRef; using ElementB = ElementB_; using LayoutB = LayoutB_; using TensorRefB = TensorRef; using ElementC = ElementC_; using LayoutC = LayoutC_; using TensorRefC = TensorRef; using TensorRefD = TensorRef; using ElementAccumulator = ElementAccumulator_; using OperatorClass = OperatorClass_; using ArchTag = ArchTag_; using ThreadblockShape = ThreadblockShape_; using WarpShape = WarpShape_; using InstructionShape = InstructionShape_; using EpilogueOutputOp = EpilogueOutputOp_; using ThreadblockSwizzle = ThreadblockSwizzle_; using Operator = Operator_; static int const kStages = Stages; static int const kAlignmentA = AlignmentA; static int const kAlignmentB = AlignmentB; static int const kAlignmentC = EpilogueOutputOp::kCount; static bool const kSplitKSerial = SplitKSerial; static ComplexTransform const kTransformA = ComplexTransform::kNone; static ComplexTransform const kTransformB = ComplexTransform::kNone; static bool const kIsASparse = IsASparse; /// Define the kernel using GemmKernel = typename kernel::DefaultEllGemm< ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC, LayoutC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, kStages, kSplitKSerial, Operator, kIsASparse >::GemmKernel; /// Argument structure struct Arguments { // // Data members // GemmCoord problem_size; TensorRef ref_A; TensorRef ref_B; TensorRef ref_C; TensorRef ref_D; const int* ell_idx; int ell_ncol; int ell_blocksize; int ell_base_idx; typename EpilogueOutputOp::Params epilogue; int split_k_slices; // // Methods // /// Default ctor CUTLASS_HOST_DEVICE Arguments(): problem_size(0, 0, 0), split_k_slices(1) { } /// Constructs an Arguments structure CUTLASS_HOST_DEVICE Arguments( GemmCoord problem_size_, TensorRef ref_A_, TensorRef ref_B_, TensorRef ref_C_, TensorRef ref_D_, const int* ell_idx_, int ell_ncol_, int ell_blocksize_, int ell_base_idx_, typename EpilogueOutputOp::Params epilogue_ = typename EpilogueOutputOp::Params(), int split_k_slices = 1 ): problem_size(problem_size_), ref_A(ref_A_), ref_B(ref_B_), ref_C(ref_C_), ref_D(ref_D_), ell_idx(ell_idx_), ell_ncol(ell_ncol_), ell_blocksize(ell_blocksize_), ell_base_idx(ell_base_idx_), epilogue(epilogue_), split_k_slices(split_k_slices) { } }; private: /// Kernel parameters object typename GemmKernel::Params params_{}; public: /// Constructs the GEMM. EllGemm() { } /// Determines whether the GEMM can execute the given problem. static Status can_implement(Arguments const &args) { if (!kSplitKSerial && args.split_k_slices > 1) { return Status::kErrorInvalidProblem; } Status status = GemmKernel::can_implement( args.problem_size, args.ref_A.non_const_ref(), args.ref_B.non_const_ref(), args.ref_C.non_const_ref(), args.ref_D ); if (status != Status::kSuccess) { return status; } return Status::kSuccess; } /// Gets the workspace size static size_t get_workspace_size(Arguments const &args) { size_t bytes = 0; // Determine grid shape ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( args.problem_size, {args.ell_blocksize, ThreadblockShape::kN, ThreadblockShape::kK}, args.split_k_slices); tiled_shape.m() *= (args.ell_blocksize + ThreadblockShape::kM - 1 ) / ThreadblockShape::kM; if (kSplitKSerial && args.split_k_slices > 1) { bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); } return bytes; } Status set(Arguments const &args, cutlass::gemm::GemmCoord const &grid_shape, void *workspace){ // Initialize the Params structure params_ = typename GemmKernel::Params{ args.problem_size, grid_shape, args.ref_A.non_const_ref(), args.ref_B.non_const_ref(), args.ref_C.non_const_ref(), args.ref_D, args.ell_idx, args.ell_ncol, args.ell_blocksize, args.ell_base_idx, args.epilogue, static_cast(workspace) }; return Status::kSuccess; } /// Initializes GEMM state from arguments. Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { // Determine grid shape ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape( args.problem_size, {args.ell_blocksize, ThreadblockShape::kN, ThreadblockShape::kK}, args.split_k_slices); grid_shape.m() *= (args.ell_blocksize + ThreadblockShape::kM - 1 ) / ThreadblockShape::kM; if (kSplitKSerial) { if (args.split_k_slices > 1) { if (!workspace) { return Status::kErrorWorkspaceNull; } size_t bytes = get_workspace_size(args); cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream); if (result != cudaSuccess) { return Status::kErrorInternal; } } } else { if (args.split_k_slices > 1) { return Status::kErrorInvalidProblem; } } return set(args, grid_shape, workspace); } /// Lightweight update given a subset of arguments Status update(Arguments const &args, void *workspace = nullptr) { if (kSplitKSerial && args.split_k_slices > 1) { if (!workspace) { return Status::kErrorWorkspaceNull; } } params_.ref_A.reset(args.ref_A.non_const_ref().data()); params_.ref_B.reset(args.ref_B.non_const_ref().data()); params_.ref_C.reset(args.ref_C.non_const_ref().data()); params_.ref_D.reset(args.ref_D.data()); params_.output_op = args.epilogue; params_.semaphore = static_cast(workspace); return Status::kSuccess; } /// Runs the kernel using initialized state. Status run(cudaStream_t stream = nullptr) { ThreadblockSwizzle threadblock_swizzle; dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); dim3 block(GemmKernel::kThreadCount, 1, 1); cudaError_t result; int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); if (smem_size >= (48 << 10)) { result = cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); if (result != cudaSuccess) { return Status::kErrorInternal; } } cutlass::arch::synclog_setup(); cutlass::Kernel<<>>(params_); result = cudaGetLastError(); return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; } /// Runs the kernel using initialized state. Status operator()(cudaStream_t stream = nullptr) { return run(stream); } /// Runs the kernel using initialized state. Status operator()( Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { Status status = initialize(args, workspace); if (status == Status::kSuccess) { status = run(stream); } return status; } }; //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand typename LayoutA_, /// Element type for B matrix operand typename ElementB_, /// Layout type for B matrix operand typename LayoutB_, /// Element type for C and D matrix operands typename ElementC_, /// Element type for internal accumulation typename ElementAccumulator_, /// Operator class tag typename OperatorClass_, /// Tag indicating architecture to tune for typename ArchTag_, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_, /// Warp-level tile size (concept: GemmShape) typename WarpShape_, /// Instruction-level tile size (concept: GemmShape) typename InstructionShape_, /// Epilogue output operator typename EpilogueOutputOp_, /// Threadblock-level swizzling operator typename ThreadblockSwizzle_, /// Number of stages used in the pipelined mainloop int Stages, /// Access granularity of A matrix in units of elements int AlignmentA, /// Access granularity of B matrix in units of elements int AlignmentB, /// If true, kernel supports split-K as a serial reduction bool SplitKSerial, /// Operation performed by GEMM typename Operator_, /// Sparse matrix is A or not bool IsASparse> class EllGemm { public: using ElementA = ElementA_; using LayoutA = LayoutA_; using TensorRefA = TensorRef; using ElementB = ElementB_; using LayoutB = LayoutB_; using TensorRefB = TensorRef; using ElementC = ElementC_; using LayoutC = layout::ColumnMajor; using TensorRefC = TensorRef; using TensorRefD = TensorRef; using ElementAccumulator = ElementAccumulator_; using OperatorClass = OperatorClass_; using ArchTag = ArchTag_; using ThreadblockShape = ThreadblockShape_; using WarpShape = WarpShape_; using InstructionShape = InstructionShape_; using EpilogueOutputOp = EpilogueOutputOp_; using ThreadblockSwizzle = ThreadblockSwizzle_; using Operator = Operator_; static int const kStages = Stages; static int const kAlignmentA = AlignmentA; static int const kAlignmentB = AlignmentB; static ComplexTransform const kTransformA = ComplexTransform::kNone; static ComplexTransform const kTransformB = ComplexTransform::kNone; static bool const kSplitKSerial = SplitKSerial; static bool const kIsASparse = false; using UnderlyingOperator = EllGemm< ElementB, typename layout::LayoutTranspose::type, ElementA, typename layout::LayoutTranspose::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsASparse >; using UnderlyingArguments = typename UnderlyingOperator::Arguments; using GemmKernel = typename UnderlyingOperator::GemmKernel; static int const kAlignmentC = UnderlyingOperator::kAlignmentC; /// Argument structure struct Arguments { // // Data members // GemmCoord problem_size; TensorRef ref_A; TensorRef ref_B; TensorRef ref_C; TensorRef ref_D; const int* ell_idx; int ell_ncol; int ell_blocksize; int ell_base_idx; typename EpilogueOutputOp::Params epilogue; int split_k_slices; // // Methods // /// Default ctor CUTLASS_HOST_DEVICE Arguments() { } /// Constructs an Arguments structure CUTLASS_HOST_DEVICE Arguments( GemmCoord problem_size_, TensorRef ref_A_, TensorRef ref_B_, TensorRef ref_C_, TensorRef ref_D_, const int* ell_idx_, int ell_ncol_, int ell_blocksize_, int ell_base_idx_, typename EpilogueOutputOp::Params epilogue_ = typename EpilogueOutputOp::Params(), int split_k_slices = 1 ): problem_size(problem_size_), ref_A(ref_A_), ref_B(ref_B_), ref_C(ref_C_), ref_D(ref_D_), ell_idx(ell_idx_), ell_ncol(ell_ncol_), ell_blocksize(ell_blocksize_), ell_base_idx(ell_base_idx_), epilogue(epilogue_), split_k_slices(split_k_slices) { } }; private: UnderlyingOperator underlying_operator_; public: /// Constructs the GEMM. EllGemm() { } /// Helper to construct a transposed equivalent for the underying GEMM operator static UnderlyingArguments to_underlying_arguments(Arguments const &args) { return UnderlyingArguments( {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()}, {args.ref_B.data(), args.ref_B.stride(0)}, {args.ref_A.data(), args.ref_A.stride(0)}, {args.ref_C.data(), args.ref_C.stride(0)}, {args.ref_D.data(), args.ref_D.stride(0)}, args.ell_idx, args.ell_ncol, args.ell_blocksize, args.ell_base_idx, args.epilogue, args.split_k_slices ); } /// Determines whether the GEMM can execute the given problem. static Status can_implement(Arguments const &args) { return UnderlyingOperator::can_implement(to_underlying_arguments(args)); } /// Gets the workspace size static size_t get_workspace_size(Arguments const &args) { size_t bytes = 0; // Determine grid shape ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( args.problem_size, {ThreadblockShape::kM, args.ell_blocksize, ThreadblockShape::kK}, args.split_k_slices); tiled_shape.n() *= (args.ell_blocksize + ThreadblockShape::kN - 1 ) / ThreadblockShape::kN; if (kSplitKSerial && args.split_k_slices > 1) { bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); } return bytes; } Status set(Arguments const &args, cutlass::gemm::GemmCoord const &grid_shape, void *workspace){ // Initialize the Params structure return underlying_operator_.set(to_underlying_arguments(args), grid_shape, workspace); } /// Initializes GEMM state from arguments. Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { // Determine grid shape ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape( {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()}, {ThreadblockShape::kM, args.ell_blocksize, ThreadblockShape::kK}, args.split_k_slices); grid_shape.n() *= (args.ell_blocksize + ThreadblockShape::kN - 1 ) / ThreadblockShape::kN; if (kSplitKSerial) { if (args.split_k_slices > 1) { if (!workspace) { return Status::kErrorWorkspaceNull; } size_t bytes = get_workspace_size(args); cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream); if (result != cudaSuccess) { return Status::kErrorInternal; } } } else { if (args.split_k_slices > 1) { return Status::kErrorInvalidProblem; } } // Initialize the Params structure set(args, grid_shape, workspace); return Status::kSuccess; } /// Lightweight update given a subset of arguments Status update(Arguments const &args, void *workspace = nullptr) { return underlying_operator_.update(to_underlying_arguments(args), workspace); } /// Runs the kernel using initialized state. Status run(cudaStream_t stream = nullptr) { return underlying_operator_.run(stream); } /// Runs the kernel using initialized state. Status operator()(cudaStream_t stream = nullptr) { return run(stream); } /// Runs the kernel using initialized state. Status operator()( Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { Status status = initialize(args, workspace, stream); if (status == Status::kSuccess) { status = run(stream); } return status; } }; //////////////////////////////////////////////////////////////////////////////// } // namespace device } // namespace gemm } // namespace cutlass ////////////////////////////////////////////////////////////////////////////////