#pragma once #include "scaled_mm.cuh" #include "cutlass_gemm_caller.cuh" /** * This file defines Gemm kernel configurations for SM120 (fp8) based on the * Gemm shape. */ namespace vllm { using c3x::cutlass_gemm_caller; template typename Epilogue> struct sm120_fp8_config_default { static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_1, _1, _1>; // Only work with Shape<_1, _1, _1> using Cutlass3xGemm = cutlass_3x_gemm_sm120; }; template typename Epilogue, typename... EpilogueArgs> inline void cutlass_gemm_sm120_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, EpilogueArgs&&... args) { static_assert(std::is_same()); TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); using Cutlass3xGemmDefault = typename sm120_fp8_config_default::Cutlass3xGemm; return cutlass_gemm_caller( out, a, b, std::forward(args)...); } template