/*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef OP128_H_
#define OP128_H_

#include <cstdint>      // fixed-width integer types (uint8_t ... uint64_t, uintptr_t)
#include <type_traits>  // std::is_signed, used by copyGlobalShared_WarpUnrolled below

inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
  asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
      : "=l"(v0), "=l"(v1) : "l"(ptr) : "memory");
}

inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
  asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
      :: "l"(v0), "l"(v1), "l"(ptr) : "memory");
}

inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
  uint64_t* shmemAsmPtr;
  asm volatile("cvta.to.shared.u64 %0, %1;"
      : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr) : "memory");
  return shmemAsmPtr;
}

inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
  asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
      : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr) : "memory");
}

inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
  asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
      :: "l"(v0), "l"(v1), "l"(shmemAsmPtr) : "memory");
}

template<typename T>
inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) {
  union {
    uint32_t tmp4[4];
    uint64_t tmp8[2];
  };
  if(sizeof(T) < 4) {
    uint32_t *ptr4 = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(ptr) & -uintptr_t(4));
    #pragma unroll
    for(int e=0; e < 4; e++) {
      // Produce 4 bytes of sub-register type by reading 2 4-byte
      // aligned values and shifting.
      uint32_t lo, hi;
      asm volatile("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0) : "memory");
      asm volatile("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1) : "memory");
      tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
    }
  }
  else if(sizeof(T) == 4) {
    #pragma unroll
    for(int e=0; e < 4; e++)
      asm volatile("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e) : "memory");
  }
  else /*sizeof(T)==8*/ {
    #pragma unroll
    for(int e=0; e < 2; e++)
      asm volatile("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e) : "memory");
  }
  v0 = tmp8[0];
  v1 = tmp8[1];
}

template<typename T>
__device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) {
  return (uint32_t)__cvta_generic_to_shared(ptr);
}
template<typename T>
__device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) {
  return (uintptr_t)__cvta_generic_to_global(ptr);
}

template<typename T>
__device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) {
  T* ans;
  asm("cvta.shared.u64 %0, %1;" : "=l"(ans) : "l"(uint64_t(shptr)));
  return ans;
}
template<typename T>
__device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
  T* ans;
  asm("cvta.global.u64 %0, %1;" : "=l"(ans) : "l"(gptr));
  return ans;
}
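// A minimal usage sketch of the 128-bit volatile accessors above: one
// ld.volatile.global.v2.u64 followed by one st.volatile.global.v2.u64 moves a
// 16-byte chunk between global buffers. The helper name `copy128Once` is
// hypothetical and exists only for illustration.
inline __device__ void copy128Once(uint64_t* dst, const uint64_t* src) {
  uint64_t v0, v1;
  load128(src, v0, v1);   // vectorized 128-bit volatile load
  store128(dst, v0, v1);  // vectorized 128-bit volatile store
}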
////////////////////////////////////////////////////////////////////////////////
// BytePack: struct of bytes.

template<int Size>
union BytePack;
template<>
union BytePack<0> {};
template<>
union BytePack<1> {
  uint8_t u8[1], native;
};
template<>
union BytePack<2> {
  BytePack<1> half[2];
  BytePack<1> b1[2];
  uint8_t u8[2];
  uint16_t u16[1], native;
};
template<>
union BytePack<4> {
  BytePack<2> half[2];
  BytePack<1> b1[4];
  BytePack<2> b2[2];
  uint8_t u8[4];
  uint16_t u16[2];
  uint32_t u32[1], native;
};
template<>
union BytePack<8> {
  BytePack<4> half[2];
  BytePack<1> b1[8];
  BytePack<2> b2[4];
  BytePack<4> b4[2];
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
  uint64_t u64[1], native;
};
template<>
union alignas(16) BytePack<16> {
  BytePack<8> half[2];
  BytePack<1> b1[16];
  BytePack<2> b2[8];
  BytePack<4> b4[4];
  BytePack<8> b8[2];
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
  uint64_t u64[2];
  ulong2 ul2[1], native;
};
template<int Size>
union BytePack {
  BytePack<Size/2> half[2];
  BytePack<1> b1[Size];
  BytePack<2> b2[Size/2];
  BytePack<4> b4[Size/4];
  BytePack<8> b8[Size/8];
  BytePack<16> b16[Size/16];
  uint8_t u8[Size];
  uint16_t u16[Size/2];
  uint32_t u32[Size/4];
  uint64_t u64[Size/8];
};

template<typename T>
struct BytePackOf {
  static constexpr int Size = sizeof(T);
  using Pack = BytePack<Size>;
};
template<>
struct BytePackOf<BytePack<0>> {
  static constexpr int Size = 0;
  using Pack = BytePack<0>;
};

template<typename T>
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value)  {
  union { typename BytePackOf<T>::Pack p; T v; };
  // Coverity recommends the use of std::move here but, given that T is a POD
  // scalar, a plain copy will be just as efficient.
  // coverity[copy_assignment_call]
  v = value;
  return p;
}

template<typename T>
__device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack)  {
  union { typename BytePackOf<T>::Pack p; T v; };
  p = pack;
  return v;
}

////////////////////////////////////////////////////////////////////////////////
// Load/store of BytePack using integral addresses.

template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ BytePack<Size> ld_relaxed_gpu_global(uintptr_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
template<int Size> __device__ void st_relaxed_gpu_global(uintptr_t addr, BytePack<Size> value);

template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_relaxed_gpu_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t addr, BytePack<0> value) {}

// Used to define implementations for above prototypes.
#define DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
  template<> \
  __device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
    data_cxx_ty tmp; \
    asm volatile("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
    BytePack<bytes> ans; \
    ans.native = tmp; \
    return ans; \
  } \
  template<> \
  __device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
    data_cxx_ty tmp; \
    asm volatile("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
    BytePack<bytes> ans; \
    ans.native = tmp; \
    return ans; \
  } \
  template<> \
  __device__ __forceinline__ void st_##space<bytes>(addr_cxx_ty addr, BytePack<bytes> value) { \
    data_cxx_ty tmp = value.native; \
    asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \
  }

#if __CUDA_ARCH__ >= 700
  #define PTX_relaxed_gpu "relaxed.gpu"
#else
  #define PTX_relaxed_gpu "volatile"
#endif

#define DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \
  template<> \
  __device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
    data_cxx_ty tmp; \
    asm volatile("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr) : "memory"); \
    BytePack<bytes> ans; \
    ans.native = tmp; \
    return ans; \
  } \
  template<> \
  __device__ __forceinline__ void st_relaxed_gpu_global<bytes>(uintptr_t addr, BytePack<bytes> value) { \
    data_cxx_ty tmp = value.native; \
    asm volatile("st." PTX_relaxed_gpu ".global." #data_ptx_ty " [%0], %1;" :: "l"(addr), #data_reg_ty(tmp) : "memory"); \
  }

#define DEFINE_ld_st__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \
  DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, global, uintptr_t, l) \
  DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, shared, uint32_t, r) \
  DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty)

// Single-byte types use 4-byte registers since there is no 1-byte register
// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
DEFINE_ld_st__size(1, uint32_t, b8, r)
DEFINE_ld_st__size(2, uint16_t, b16, h)
DEFINE_ld_st__size(4, uint32_t, b32, r)
DEFINE_ld_st__size(8, uint64_t, b64, l)

#undef DEFINE_ld_st__size_space
#undef DEFINE_ld_st__size

#define DEFINE_ld_st_16__space(space, addr_cxx_ty, addr_reg_ty) \
  template<> \
  __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
    BytePack<16> ans; \
    asm volatile("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
    return ans; \
  } \
  template<> \
  __device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
    BytePack<16> ans; \
    asm volatile("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
    return ans; \
  } \
  template<> \
  __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
    asm volatile("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
  }
DEFINE_ld_st_16__space(global, uintptr_t, l)
DEFINE_ld_st_16__space(shared, uint32_t, r)
#undef DEFINE_ld_st_16__space

template<>
__device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) {
  BytePack<16> ans;
  asm volatile("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr) : "memory");
  return ans;
}
template<>
__device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePack<16> value) {
  asm volatile("st." PTX_relaxed_gpu ".global.v2.b64 [%0], {%1,%2};" :: "l"(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory");
}

#undef PTX_relaxed_gpu

////////////////////////////////////////////////////////////////////////////////
// Atomic load/store using c++ pointers.

__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
  uint64_t ans;
  asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
  return ans;
}

__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
  uint64_t ans;
  #if __CUDA_ARCH__ >= 700
    asm volatile("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
  #else
    asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
  #endif
  return ans;
}

__device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) {
  uint64_t ans;
  #if __CUDA_ARCH__ >= 700
    asm volatile("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
  #else
    asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
  #endif
  return ans;
}

__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
  uint64_t ans;
  #if __CUDA_ARCH__ >= 700
    asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
  #else
    asm volatile("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
  #endif
  return ans;
}

__device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) {
  asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
}

__device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
  #if __CUDA_ARCH__ >= 700
    asm volatile("st.relaxed.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
  #else
    asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
  #endif
}

__device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) {
  #if __CUDA_ARCH__ >= 700
    asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
  #else
    asm volatile("membar.sys; st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
  #endif
}

__device__ __forceinline__ void fence_acq_rel_sys() {
  #if __CUDA_ARCH__ >= 700
    asm volatile("fence.acq_rel.sys;" ::: "memory");
  #else
    asm volatile("membar.sys;" ::: "memory");
  #endif
}

__device__ __forceinline__ void fence_acq_rel_gpu() {
  #if __CUDA_ARCH__ >= 700
    asm volatile("fence.acq_rel.gpu;" ::: "memory");
  #else
    asm volatile("membar.gl;" ::: "memory");
  #endif
}
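// A minimal sketch of how the acquire/release helpers above are meant to pair:
// the producer publishes data with st_release_sys_global so its prior writes
// become visible before the flag, and the consumer spins with
// ld_acquire_sys_global so loads issued after the wait observe that data.
// `postFlag` and `waitFlag` are hypothetical names used only for illustration.
__device__ __forceinline__ void postFlag(uint64_t* flag, uint64_t value) {
  st_release_sys_global(flag, value);                 // release store: orders prior writes before the flag
}
__device__ __forceinline__ void waitFlag(uint64_t* flag, uint64_t expected) {
  while (ld_acquire_sys_global(flag) != expected) {}  // acquire load: orders later reads after the flag
}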
////////////////////////////////////////////////////////////////////////////////
// Multimem stores of BytePack.

template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val);

#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
  template<>
  __device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) {
    // nop
  }
  template<>
  __device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) {
    asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.native) : "memory");
  }
  template<>
  __device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) {
    asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.native) : "memory");
  }
  template<>
  __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
    asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.native) : "memory");
  }
  template<>
  __device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
    asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.native) : "memory");
  }
  template<>
  __device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
    asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};"
      :: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3])
      : "memory");
  }
#else
  template<int Size>
  __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
    // nop
  }
#endif

// Load pack starting at index in array. Ignore elements past end (length of array).
template<typename Pack, typename T>
__device__ __forceinline__ Pack loadPack(T* ptr, int ix, int end) {
  constexpr int Size = sizeof(Pack);
  ptr += ix;
  int n = end - ix;
  if (alignof(T) == Size && sizeof(T) == Size) {
    return *(Pack*)ptr;
  } else if ((Size+3)/4 + 1 < Size/sizeof(T)) {
    union { Pack ans; uint32_t part[Size/4]; };
    int misalign = reinterpret_cast<uintptr_t>(ptr) % 4;
    uint32_t* down = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(ptr) & -uintptr_t(4));
    int i;
    #pragma unroll
    for (i=0; i < Size/4; i++) {
      if (i*4/sizeof(T) < 1 || i*4/sizeof(T) < n) part[i] = down[i];
    }
    uint32_t extra;
    if (misalign) extra = down[i];
    #pragma unroll
    for (i=0; i < Size/4; i++) {
      part[i] = __funnelshift_r(part[i], part[i+1], 8*misalign);
    }
    if (misalign) part[i] = __funnelshift_r(part[i], extra, 8*misalign);
    return ans;
  } else {
    union { Pack ans; BytePack<sizeof(T)> part[Size/sizeof(T)]; };
    #pragma unroll
    for (int i=0; i < Size/sizeof(T); i++) {
      if (i < 1 || i < n) part[i] = ((BytePack<sizeof(T)>*)ptr)[i];
    }
    return ans;
  }
}

// Store pack starting at index in array. Ignore elements past end (length of array).
template<typename Pack, typename T>
__device__ __forceinline__ void storePack(T* ptr, int ix, int end, Pack val) {
  constexpr int Size = sizeof(Pack);
  union { Pack tmp; BytePack<sizeof(T)> part[Size/sizeof(T)]; };
  tmp = val;
  ptr += ix;
  int n = end - ix;
  #pragma unroll
  for (int i=0; i < Size/sizeof(T); i++) {
    if (i < 1 || i < n) ((BytePack<sizeof(T)>*)ptr)[i] = part[i];
  }
}
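// A minimal sketch of loadPack/storePack in use, assuming only the semantics
// documented above (elements at or past `end` are ignored): copy `count`
// uint16_t elements in 16-byte packs, including a ragged tail. The helper name
// `copyU16Sketch` is hypothetical and exists only for illustration.
__device__ __forceinline__ void copyU16Sketch(uint16_t* dst, uint16_t* src, int count) {
  for (int ix = 0; ix < count; ix += 8) {                        // 8 x uint16_t = 16 bytes per pack
    BytePack<16> pack = loadPack<BytePack<16>>(src, ix, count);  // elements past `count` are ignored
    storePack(dst, ix, count, pack);                             // stores only the in-range elements
  }
}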
// Warp-uniform memory copy from shared address (not generic) to global memory.
// The number of bytes copied is `min(MaxBytes, nBytesAhead)`; a negative value
// is interpreted as zero. EltSize is the guaranteed alignment of the addresses
// and sizes. (WARP_SIZE and divUp() are used below but defined elsewhere in the
// code base.)
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
    int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
  ) {
  static_assert(std::is_signed<IntBytes>::value, "`IntBytes` must be a signed integral type.");
  int nBytes = min(nBytesAhead, (IntBytes)MaxBytes);
  int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16);
  int nMiddleBytes = (nBytes-nFrontBytes) & -16;
  int nBackBytes = (nBytes-nFrontBytes) % 16;

  { int backLane = WARP_SIZE-1 - lane;
    bool hasFront = lane*EltSize < nFrontBytes;
    bool hasBack = backLane*EltSize < nBackBytes;
    int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize);
    if (hasFront | hasBack) {
      BytePack<EltSize> tmp = ld_shared<EltSize>(srcAddr+offset);
      // Can't use multimem_st since it doesn't support EltSize==2
      st_global<EltSize>(dstAddr+offset, tmp);
    }
  }

  srcAddr += nFrontBytes;
  int srcMisalign = EltSize < 4 ? (srcAddr%4) : 0;
  srcAddr += -srcMisalign + lane*16;
  dstAddr += nFrontBytes + lane*16;
  nMiddleBytes -= lane*16;
  #pragma unroll
  for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) {
    if (nMiddleBytes <= 0) break;
    union {
      BytePack<4> b4[4];
      BytePack<16> b16;
    };
    b4[0] = ld_shared<4>(srcAddr + 0*4);
    b4[1] = ld_shared<4>(srcAddr + 1*4);
    b4[2] = ld_shared<4>(srcAddr + 2*4);
    b4[3] = ld_shared<4>(srcAddr + 3*4);
    if (srcMisalign != 0) {
      BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
      b4[0].native = __funnelshift_r(b4[0].native, b4[1].native, srcMisalign*8);
      b4[1].native = __funnelshift_r(b4[1].native, b4[2].native, srcMisalign*8);
      b4[2].native = __funnelshift_r(b4[2].native, b4[3].native, srcMisalign*8);
      b4[3].native = __funnelshift_r(b4[3].native, b4_4.native, srcMisalign*8);
    }
    if (Multimem) multimem_st_global<16>(dstAddr, b16);
    else st_global<16>(dstAddr, b16);

    srcAddr += WARP_SIZE*16;
    dstAddr += WARP_SIZE*16;
    nMiddleBytes -= WARP_SIZE*16;
  }
}

#endif