/*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file \brief Architecture-specific operators on memory added for SM75 */ #pragma once #include "cutlass/array.h" #include "cutlass/detail/helper_macros.hpp" #include "cutlass/layout/matrix.h" #include "cute/arch/copy_sm75.hpp" #include "cute/arch/util.hpp" namespace cutlass { namespace arch { ///////////////////////////////////////////////////////////////////////////////////////////////// template < /// Layout of destination matrix (column-major implies transpose) typename Layout, /// .x1, .x2, or .x4 int MatrixCount > CUTLASS_DEVICE void ldsm(Array & D, void const* ptr); ///////////////////////////////////////////////////////////////////////////////////////////////// // // Determine the appropriate way to target PTX's "ldmatrix" instruction. // ///////////////////////////////////////////////////////////////////////////////////////////////// /// CUTLASS helper to get SMEM pointer CUTLASS_HOST_DEVICE unsigned cutlass_get_smem_pointer(void *ptr) { return cute::cast_smem_ptr_to_uint(ptr); } /// CUTLASS helper to get SMEM pointer CUTLASS_DEVICE unsigned cutlass_get_smem_pointer(void const *ptr) { return cutlass_get_smem_pointer(const_cast(ptr)); } ///////////////////////////////////////////////////////////////////////////////////////////////// template <> CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); int x; asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); reinterpret_cast(D) = x; #else CUTLASS_UNUSED(D); CUTLASS_UNUSED(ptr); CUTLASS_NOT_IMPLEMENTED(); #endif } ///////////////////////////////////////////////////////////////////////////////////////////////// template <> CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); int x, y; asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr)); reinterpret_cast(D) = make_int2(x, y); #else CUTLASS_UNUSED(D); CUTLASS_UNUSED(ptr); CUTLASS_NOT_IMPLEMENTED(); #endif } ///////////////////////////////////////////////////////////////////////////////////////////////// template <> CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); int x, y, z, w; asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr)); reinterpret_cast(D) = make_int4(x, y, z, w); #else CUTLASS_UNUSED(D); CUTLASS_UNUSED(ptr); CUTLASS_NOT_IMPLEMENTED(); #endif } ///////////////////////////////////////////////////////////////////////////////////////////////// // // Transpose on 16b granularity // ///////////////////////////////////////////////////////////////////////////////////////////////// template <> CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); int x; asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); reinterpret_cast(D) = x; #else CUTLASS_UNUSED(D); CUTLASS_UNUSED(ptr); CUTLASS_NOT_IMPLEMENTED(); #endif } ///////////////////////////////////////////////////////////////////////////////////////////////// template <> CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); int x, y; asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr)); reinterpret_cast(D) = make_int2(x, y); #else CUTLASS_UNUSED(D); CUTLASS_UNUSED(ptr); CUTLASS_NOT_IMPLEMENTED(); #endif } ///////////////////////////////////////////////////////////////////////////////////////////////// template <> CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); int x, y, z, w; asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr)); reinterpret_cast(D) = make_int4(x, y, z, w); #else CUTLASS_UNUSED(D); CUTLASS_UNUSED(ptr); CUTLASS_NOT_IMPLEMENTED(); #endif } ///////////////////////////////////////////////////////////////////////////////////////////////// template struct shared_load_op { CUTLASS_DEVICE shared_load_op(AccessType &D, void const *ptr) { D = *reinterpret_cast(ptr); } }; template CUTLASS_DEVICE void shared_load(AccessType &D, void const *ptr) { shared_load_op(D, ptr); } ///////////////////////////////////////////////////////////////////////////////////////////////// template struct shared_load_op { CUTLASS_DEVICE shared_load_op(AccessType &D, void const *ptr) { unsigned addr = cutlass_get_smem_pointer(ptr); uint4 v; asm volatile ("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];" : "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w) : "r"(addr)); D = reinterpret_cast(v); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// template struct shared_load_op { CUTLASS_DEVICE shared_load_op(AccessType &D, void const *ptr) { unsigned addr = cutlass_get_smem_pointer(ptr); uint2 v; asm volatile ("ld.shared.v2.b32 {%0, %1}, [%2];" : "=r"(v.x), "=r"(v.y) : "r"(addr)); D = reinterpret_cast(v); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace arch } // namespace cutlass