/*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /* \file \brief Execution environment */ #include #include "cutlass/numeric_types.h" #include "cutlass/layout/matrix.h" #include "cutlass/layout/tensor.h" #include "cutlass/util/reference/device/tensor_compare.h" #include "cutlass/util/reference/device/tensor_fill.h" #include "cutlass/util/reference/host/tensor_fill.h" #include "cutlass/util/host_tensor.h" #include "cutlass/util/tensor_view_io.h" #include "cutlass/library/util.h" #include "cutlass/profiler/device_allocation.h" namespace cutlass { namespace profiler { ///////////////////////////////////////////////////////////////////////////////////////////////// size_t DeviceAllocation::bytes(library::NumericTypeID type, size_t capacity) { return size_t(cutlass::library::sizeof_bits(type)) * capacity / 8; } ///////////////////////////////////////////////////////////////////////////////////////////////// template static std::vector get_packed_layout_stride(std::vector const &extent) { typename Layout::TensorCoord extent_coord; typename Layout::Stride stride_coord; if (extent.size() != size_t(Layout::kRank)) { throw std::runtime_error("Layout does not have same rank as extent vector."); } for (int i = 0; i < Layout::kRank; ++i) { extent_coord[i] = extent.at(i); } std::vector stride; stride.resize(Layout::kStrideRank, 0); Layout layout = Layout::packed(extent_coord); stride_coord = layout.stride(); for (int i = 0; i < Layout::kStrideRank; ++i) { stride.at(i) = (int64_t)stride_coord[i]; } return stride; } /// Returns the stride of a packed layout std::vector DeviceAllocation::get_packed_layout( library::LayoutTypeID layout_id, std::vector const &extent) { std::vector stride; switch (layout_id) { case library::LayoutTypeID::kColumnMajor: stride = get_packed_layout_stride(extent); break; case library::LayoutTypeID::kRowMajor: stride = get_packed_layout_stride(extent); break; case library::LayoutTypeID::kColumnMajorInterleavedK2: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kRowMajorInterleavedK2: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kColumnMajorInterleavedK4: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kRowMajorInterleavedK4: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kColumnMajorInterleavedK16: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kRowMajorInterleavedK16: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kColumnMajorInterleavedK32: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kRowMajorInterleavedK32: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kColumnMajorInterleavedK64: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kRowMajorInterleavedK64: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kTensorNCHW: stride = get_packed_layout_stride(extent); break; case library::LayoutTypeID::kTensorNHWC: stride = get_packed_layout_stride(extent); break; case library::LayoutTypeID::kTensorNDHWC: stride = get_packed_layout_stride(extent); break; case library::LayoutTypeID::kTensorNC32HW32: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kTensorNC64HW64: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kTensorC32RSK32: stride = get_packed_layout_stride>(extent); break; case library::LayoutTypeID::kTensorC64RSK64: stride = get_packed_layout_stride>(extent); break; default: break; } return stride; } ///////////////////////////////////////////////////////////////////////////////////////////////// /// Template to use CUTLASS Layout functions to template static size_t construct_layout_( void *bytes, library::LayoutTypeID layout_id, std::vector const &extent, std::vector &stride) { if (extent.size() != Layout::kRank) { throw std::runtime_error( "Layout must have same rank as extent vector."); } if (Layout::kStrideRank && stride.empty()) { stride = get_packed_layout_stride(extent); return construct_layout_( bytes, layout_id, extent, stride); } else if (Layout::kStrideRank && stride.size() != Layout::kStrideRank) { throw std::runtime_error( "Layout requires either empty stride or stride vector matching Layout::kStrideRank"); } typename Layout::Stride stride_coord; for (int i = 0; i < Layout::kStrideRank; ++i) { stride_coord[i] = (int)stride.at(i); } typename Layout::TensorCoord extent_coord; for (int i = 0; i < Layout::kRank; ++i) { extent_coord[i] = extent.at(i); } // Construct the CUTLASS layout object from the stride object Layout layout(stride_coord); // Pack it into bytes if (bytes) { *reinterpret_cast(bytes) = layout; } // Return capacity size_t capacity_ = layout.capacity(extent_coord); return capacity_; } /// returns the capacity needed size_t DeviceAllocation::construct_layout( void *bytes, library::LayoutTypeID layout_id, std::vector const &extent, std::vector &stride) { switch (layout_id) { case library::LayoutTypeID::kColumnMajor: return construct_layout_(bytes, layout_id, extent, stride); case library::LayoutTypeID::kRowMajor: return construct_layout_(bytes, layout_id, extent, stride); case library::LayoutTypeID::kColumnMajorInterleavedK2: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kRowMajorInterleavedK2: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kColumnMajorInterleavedK4: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kRowMajorInterleavedK4: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kColumnMajorInterleavedK16: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kRowMajorInterleavedK16: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kColumnMajorInterleavedK32: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kRowMajorInterleavedK32: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kColumnMajorInterleavedK64: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kRowMajorInterleavedK64: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kTensorNCHW: return construct_layout_(bytes, layout_id, extent, stride); case library::LayoutTypeID::kTensorNHWC: return construct_layout_(bytes, layout_id, extent, stride); case library::LayoutTypeID::kTensorNDHWC: return construct_layout_(bytes, layout_id, extent, stride); case library::LayoutTypeID::kTensorNC32HW32: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kTensorNC64HW64: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kTensorC32RSK32: return construct_layout_>(bytes, layout_id, extent, stride); case library::LayoutTypeID::kTensorC64RSK64: return construct_layout_>(bytes, layout_id, extent, stride); default: break; } return 0; } ///////////////////////////////////////////////////////////////////////////////////////////////// DeviceAllocation::DeviceAllocation(): type_(library::NumericTypeID::kInvalid), batch_stride_(0), capacity_(0), pointer_(nullptr), layout_(library::LayoutTypeID::kUnknown), batch_count_(1) { cudaGetDevice(&device_); } DeviceAllocation::DeviceAllocation( library::NumericTypeID type, size_t capacity, int device ): type_(type), batch_stride_(capacity), capacity_(capacity), pointer_(nullptr), layout_(library::LayoutTypeID::kUnknown), batch_count_(1), device_(device) { cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity)); if (result != cudaSuccess) { type_ = library::NumericTypeID::kInvalid; capacity_ = 0; pointer_ = nullptr; throw std::bad_alloc(); } } DeviceAllocation::DeviceAllocation( library::NumericTypeID type, library::LayoutTypeID layout_id, std::vector const &extent, std::vector const &stride, int batch_count, int device ): type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)), pointer_(nullptr), batch_count_(1), device_(device) { reset(type, layout_id, extent, stride, batch_count); } DeviceAllocation::~DeviceAllocation() { if (pointer_) { int current_device; cudaGetDevice(¤t_device); if (current_device != device_) { cudaSetDevice(device_); } cudaFree(pointer_); if (current_device != device_) { cudaSetDevice(current_device); } } } DeviceAllocation &DeviceAllocation::reset() { if (pointer_) { int current_device; cudaGetDevice(¤t_device); if (current_device != device_) { cudaSetDevice(device_); } cudaFree(pointer_); if (current_device != device_) { cudaSetDevice(current_device); } } type_ = library::NumericTypeID::kInvalid; batch_stride_ = 0; capacity_ = 0; pointer_ = nullptr; layout_ = library::LayoutTypeID::kUnknown; stride_.clear(); extent_.clear(); tensor_ref_buffer_.clear(); batch_count_ = 1; return *this; } DeviceAllocation &DeviceAllocation::reset(library::NumericTypeID type, size_t capacity) { reset(); type_ = type; batch_stride_ = capacity; capacity_ = capacity; cudaError_t result = this->malloc((void **)&pointer_, bytes(type_, capacity_)); if (result != cudaSuccess) { throw std::bad_alloc(); } layout_ = library::LayoutTypeID::kUnknown; stride_.clear(); extent_.clear(); batch_count_ = 1; tensor_ref_buffer_.resize(sizeof(pointer_), 0); std::memcpy(tensor_ref_buffer_.data(), &pointer_, sizeof(pointer_)); return *this; } /// Allocates memory for a given layout and tensor DeviceAllocation &DeviceAllocation::reset( library::NumericTypeID type, library::LayoutTypeID layout_id, std::vector const &extent, std::vector const &stride, int batch_count) { reset(); tensor_ref_buffer_.resize(sizeof(pointer_) + (sizeof(int64_t) * library::get_layout_stride_rank(layout_id)), 0); type_ = type; layout_ = layout_id; stride_ = stride; extent_ = extent; batch_count_ = batch_count; batch_stride_ = construct_layout( tensor_ref_buffer_.data() + sizeof(pointer_), layout_id, extent, stride_); capacity_ = batch_stride_ * batch_count_; cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity_)); if (result != cudaSuccess) { throw std::bad_alloc(); } std::memcpy(tensor_ref_buffer_.data(), &pointer_, sizeof(pointer_)); return *this; } bool DeviceAllocation::good() const { return (capacity_ && pointer_); } library::NumericTypeID DeviceAllocation::type() const { return type_; } void *DeviceAllocation::data() const { return pointer_; } void *DeviceAllocation::batch_data(int batch_idx) const { return static_cast(data()) + batch_stride_bytes() * batch_idx; } library::LayoutTypeID DeviceAllocation::layout() const { return layout_; } std::vector const & DeviceAllocation::stride() const { return stride_; } /// Gets the extent vector std::vector const & DeviceAllocation::extent() const { return extent_; } /// Gets the number of adjacent tensors in memory int DeviceAllocation::batch_count() const { return batch_count_; } /// Gets the stride (in units of elements) between items int64_t DeviceAllocation::batch_stride() const { return batch_stride_; } /// Gets the stride (in units of bytes) between items int64_t DeviceAllocation::batch_stride_bytes() const { return bytes(type_, batch_stride_); } size_t DeviceAllocation::capacity() const { return capacity_; } size_t DeviceAllocation::bytes() const { return bytes(type_, capacity_); } /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_from_device(void const *ptr) { if (!bytes()) { #ifndef NDEBUG std::cout << "Skipping copy of size 0 allocation\n"; #endif return; } cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice); if (result != cudaSuccess) { throw std::runtime_error("Failed device-to-device copy"); } } /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_from_host(void const *ptr) { if (!bytes()) { #ifndef NDEBUG std::cout << "Skipping copy of size 0 allocation\n"; #endif return; } cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice); if (result != cudaSuccess) { throw std::runtime_error("Failed host-to-device copy"); } } /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_to_host(void *ptr) { if (!bytes()) { #ifndef NDEBUG std::cout << "Skipping copy of size 0 allocation\n"; #endif return; } cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost); if (result != cudaSuccess) { throw std::runtime_error("Failed device-to-host copy"); } } void DeviceAllocation::initialize_random_device(int seed, Distribution dist) { if (!bytes()) { #ifndef NDEBUG std::cout << "Skipping initialization of size 0 allocation\n"; #endif return; } if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } // Instantiate calls to CURAND here. This file takes a long time to compile for // this reason. switch (type_) { case library::NumericTypeID::kF16: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kBF16: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kTF32: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kF32: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kCBF16: cutlass::reference::device::BlockFillRandom>( reinterpret_cast *>(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kCTF32: cutlass::reference::device::BlockFillRandom>( reinterpret_cast *>(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kCF32: cutlass::reference::device::BlockFillRandom>( reinterpret_cast *>(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kFE4M3: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kFE5M2: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kFUE4M3: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kFUE8M0: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kFE2M3: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kFE3M2: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kFE2M1: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kF64: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kCF64: cutlass::reference::device::BlockFillRandom>( reinterpret_cast *>(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kS2: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kS4: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kS8: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kS16: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kS32: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kS64: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kB1: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kU2: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kU4: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kU8: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kU16: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kU32: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; case library::NumericTypeID::kU64: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), capacity_, seed, dist ); break; default: break; } } void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { if (!bytes()) { #ifndef NDEBUG std::cout << "Skipping initialization of size 0 allocation\n"; #endif return; } if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } std::vector host_data(bytes()); switch (type_) { case library::NumericTypeID::kFE4M3: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kFE5M2: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kFUE4M3: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kFE2M3: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kFE3M2: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kFE2M1: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kFUE8M0: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kF16: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kBF16: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kTF32: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kF32: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kCF16: cutlass::reference::host::BlockFillRandom>( reinterpret_cast *>(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kCBF16: cutlass::reference::host::BlockFillRandom>( reinterpret_cast *>(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kCTF32: cutlass::reference::host::BlockFillRandom>( reinterpret_cast *>(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kCF32: cutlass::reference::host::BlockFillRandom>( reinterpret_cast *>(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kF64: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kCF64: cutlass::reference::host::BlockFillRandom>( reinterpret_cast *>(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kS2: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kS4: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kS8: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kS16: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kS32: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kS64: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kB1: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kU2: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kU4: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kU8: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kU16: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kU32: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; case library::NumericTypeID::kU64: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), capacity_, seed, dist ); break; default: break; } copy_from_host(host_data.data()); } void DeviceAllocation::initialize_sequential_device(Distribution dist) { if (!bytes()) { #ifndef NDEBUG std::cout << "Skipping initialization of size 0 allocation\n"; #endif return; } if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } switch (type_) { case library::NumericTypeID::kFE4M3: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE5M2: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFUE4M3: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE2M3: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE3M2: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE2M1: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFUE8M0: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kF16: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kBF16: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kTF32: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kF32: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kCF16: cutlass::reference::device::BlockFillSequential>( reinterpret_cast *>(pointer_), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kCBF16: cutlass::reference::device::BlockFillSequential>( reinterpret_cast *>(pointer_), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kCTF32: cutlass::reference::device::BlockFillSequential>( reinterpret_cast *>(pointer_), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kCF32: cutlass::reference::device::BlockFillSequential>( reinterpret_cast *>(pointer_), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kF64: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kCF64: cutlass::reference::device::BlockFillSequential>( reinterpret_cast *>(pointer_), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kS2: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kS4: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kS8: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kS16: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kS32: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kS64: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kB1: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kU2: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kU4: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kU8: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kU16: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kU32: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kU64: cutlass::reference::device::BlockFillSequential( reinterpret_cast(pointer_), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; default: break; } } void DeviceAllocation::initialize_sequential_host(Distribution dist) { if (!bytes()) { #ifndef NDEBUG std::cout << "Skipping initialization of size 0 allocation\n"; #endif return; } if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } std::vector host_data(bytes()); switch (type_) { case library::NumericTypeID::kFE4M3: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE5M2: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFUE4M3: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE2M3: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE3M2: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFE2M1: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kFUE8M0: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kF16: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kBF16: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kTF32: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kF32: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kCF16: cutlass::reference::host::BlockFillSequential>( reinterpret_cast *>(host_data.data()), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kCBF16: cutlass::reference::host::BlockFillSequential>( reinterpret_cast *>(host_data.data()), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kCTF32: cutlass::reference::host::BlockFillSequential>( reinterpret_cast *>(host_data.data()), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kCF32: cutlass::reference::host::BlockFillSequential>( reinterpret_cast *>(host_data.data()), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kF64: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kCF64: cutlass::reference::host::BlockFillSequential>( reinterpret_cast *>(host_data.data()), capacity_, cutlass::complex( static_cast(dist.sequential.delta)), cutlass::complex( static_cast(dist.sequential.start)) ); break; case library::NumericTypeID::kS2: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kS4: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast(dist.sequential.start) ); break; case library::NumericTypeID::kS8: cutlass::reference::host::BlockFillSequential( reinterpret_cast(host_data.data()), capacity_, static_cast(dist.sequential.delta), static_cast