#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with SVE]

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/native/quantized/AffineQuantizerBase.h>
#include <c10/util/irange.h>
#include <c10/util/qint32.h>
#include <c10/util/qint8.h>
#include <c10/util/quint8.h>

#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>

// This file defines Vectorized<> for the quantized types.
//
// Currently, we simply use these classes as efficient converters between
// the quantized types and Vectorized<float>, usually in bandwidth-bound
// cases where doing the arithmetic in full precision is acceptable (e.g.
// elementwise operators).
//
// Conversions are as follows:
//  Vectorized<qint8>  -> 4x Vectorized<float>
//  Vectorized<quint8> -> 4x Vectorized<float>
//  Vectorized<qint32> -> 1x Vectorized<float>
//
// The size of the returned float vector is specified by the special
// constexpr function float_num_vecs. The type of the value returned
// from dequantize (and expected as an argument to quantize) is
// specified by float_vec_return_type.
//
// When writing kernels with these vectors, it is expected that floating-
// point operations will be carried out in a loop over
// Vectorized<T>::float_num_vecs iterations.

namespace at::vec {
// Note [CPU_CAPABILITY namespace]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This header, and all of its subheaders, will be compiled with
// different architecture flags for each supported set of vector
// intrinsics. So we need to make sure they aren't inadvertently
// linked together. We do this by declaring objects in an `inline
// namespace` which changes the name mangling, but can still be
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
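// Illustrative sketch (not part of this header's API): the intended usage
// pattern is to dequantize, do the arithmetic in float, and quantize back.
// The names `src`, `dst`, `scale_vec`, `zp_vec`, and `premul_vec` below are
// hypothetical placeholders for kernel-supplied values.
//
//   auto qx = Vectorized<c10::qint8>::loadu(src);
//   auto fx = qx.dequantize(scale_vec, zp_vec, premul_vec);
//   for (int i = 0; i < Vectorized<c10::qint8>::float_num_vecs(); ++i) {
//     fx[i] = fx[i] * fx[i]; // any elementwise op, carried out in float
//   }
//   auto qy = Vectorized<c10::qint8>::quantize(
//       fx, scale, zero_point, inverse_scale);
//   qy.store(dst);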
// NOTE: These are low-performance implementations that we fall back on
// if we are not building with SVE. This may not be an issue, because
// currently for quantization we assume the user has at least SVE
// installed, so these can simply act as a reference implementation.
//
// If in the future we relax this requirement (SVE+), we should probably
// revisit these implementations

template <
    typename T,
    typename float_vec_return_type_,
    typename int_vec_return_type_,
    int size_>
struct VectorizedQuantizedConverter {
  using size_type = int;
  static constexpr size_type size() {
    return size_;
  }

  static constexpr int float_num_vecs() {
    return size() / Vectorized<float>::size();
  }

  static constexpr int int_num_vecs() {
    return size() / Vectorized<int32_t>::size();
  }

  using float_vec_return_type = float_vec_return_type_;
  using int_vec_return_type = int_vec_return_type_;

  using value_type = typename T::underlying;
  std::array<value_type, size_> vals;

  VectorizedQuantizedConverter(T val) {
    for (size_t i = 0; i < size(); ++i) {
      vals[i] = val.val_;
    }
  }

  VectorizedQuantizedConverter(const void* ptr) {
    memcpy(vals.data(), ptr, sizeof(value_type) * size());
  }

  void store(void* ptr, int count = size()) const {
    memcpy(ptr, vals.data(), count * sizeof(value_type));
  }

  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> scale_zp_premul) const {
    float_vec_return_type rv;
    float tmp_scale[Vectorized<float>::size()];
    float tmp_zero_point[Vectorized<float>::size()];
    scale.store(tmp_scale);
    zero_point.store(tmp_zero_point);
    for (int i = 0; i < float_num_vecs(); ++i) {
      float tmp_vals[Vectorized<float>::size()];
      for (int j = 0; j < Vectorized<float>::size(); ++j) {
        tmp_vals[j] = at::native::dequantize_val(
            tmp_scale[j],
            tmp_zero_point[j],
            T(vals[Vectorized<float>::size() * i + j]));
      }
      rv[i] = Vectorized<float>::loadu(tmp_vals);
    }
    return rv;
  }

  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
    float_vec_return_type rv;
    float tmp_scale[Vectorized<float>::size()];
    float tmp_zero_point[Vectorized<float>::size()];
    scale.store(tmp_scale);
    zero_point.store(tmp_zero_point);
    for (int i = 0; i < float_num_vecs(); ++i) {
      float tmp_vals[Vectorized<float>::size()];
      for (int j = 0; j < Vectorized<float>::size(); ++j) {
        tmp_vals[j] = at::native::dequantize_val(
            tmp_scale[j],
            tmp_zero_point[j],
            T(vals[Vectorized<float>::size() * i + j]));
      }
      rv[i] = Vectorized<float>::loadu(tmp_vals);
    }
    return rv;
  }

 protected:
  VectorizedQuantizedConverter() {}
};
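// For reference, the scalar affine mapping that at::native::dequantize_val
// and at::native::quantize_vec apply per lane is (up to rounding and
// clamping to the range of the underlying type):
//
//   x_float = scale * (x_quant - zero_point)
//   x_quant = round(x_float / scale) + zero_point
//
// For example, with scale = 0.5f and zero_point = 10, a stored value of 14
// dequantizes to 0.5f * (14 - 10) = 2.0f.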
template <>
struct is_vec_specialized_for<c10::qint32> : std::bool_constant<true> {};

template <>
struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
                                     c10::qint32,
                                     std::array<Vectorized<float>, 1>,
                                     std::array<Vectorized<c10::qint32>, 1>,
                                     VECTOR_WIDTH / 4> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            VECTOR_WIDTH / 4>() {}
  Vectorized(c10::qint32 val)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            VECTOR_WIDTH / 4>(val) {}
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            VECTOR_WIDTH / 4>(ptr) {}

#if 1
  static Vectorized<c10::qint32> loadu(const void* ptr) {
    return Vectorized<c10::qint32>(ptr);
  }

  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(
        tmp_values,
        reinterpret_cast<const value_type*>(ptr),
        count * sizeof(value_type));
    return loadu(tmp_values);
  }
#else
  static Vectorized<c10::qint32> loadu(
      const void* ptr,
      int64_t count = size()) {
    if (count == size())
      return svld1_s32(ptrue, reinterpret_cast<const int32_t*>(ptr));
    svbool_t pg = svwhilelt_b32(0ull, count);
    return svld1_s32(pg, reinterpret_cast<const int32_t*>(ptr));
  }
#endif

  static Vectorized<c10::qint32> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;

    for (int i = 0; i < float_num_vecs(); ++i) {
      rhs[i].store(
          &float_vals[i * Vectorized<float>::size()],
          Vectorized<float>::size());
    }

    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint32*)qvals.data(),
        Vectorized<float>::size() * float_num_vecs());

    return Vectorized<c10::qint32>::loadu(qvals.data());
  }

  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::qint32> relu6(
      Vectorized<c10::qint32> zero_point,
      Vectorized<c10::qint32> q_six) {
    Vectorized<c10::qint32> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
    int_vec_return_type retval;
    for (size_t i = 0; i < size(); ++i) {
      retval[0].vals[i] = vals[i] - b.vals[i];
    }
    return retval;
  }

  static Vectorized<c10::qint32> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    Vectorized<c10::qint32> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] =
          nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
          zero_point;
    }
    return retval;
  }
};

template <>
Vectorized<c10::qint32> inline maximum(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  return a.maximum(b);
}

template <>
Vectorized<c10::qint32> inline operator*(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  Vectorized<c10::qint32> retval;
  for (size_t i = 0; i < std::decay_t<decltype(a)>::size(); ++i) {
    retval.vals[i] = a.vals[i] * b.vals[i];
  }
  return retval;
}

template <>
Vectorized<c10::qint32> inline operator+(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  Vectorized<c10::qint32> retval;
  for (size_t i = 0; i < std::decay_t<decltype(a)>::size(); ++i) {
    retval.vals[i] = a.vals[i] + b.vals[i];
  }
  return retval;
}
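// Illustrative sketch (hypothetical pointers and parameters):
// widening_subtract and requantize_from_int are meant to be paired for
// integer-domain arithmetic, avoiding a round trip through float vectors:
//
//   auto a = Vectorized<c10::qint32>::loadu(pa);
//   auto b = Vectorized<c10::qint32>::loadu(pb);
//   auto diff = a.widening_subtract(b); // int32 intermediate values
//   auto out = Vectorized<c10::qint32>::requantize_from_int(
//       diff, multiplier, out_zero_point); // rescale, round, shift
//   out.store(pout);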
template <>
struct is_vec_specialized_for<c10::qint8> : std::bool_constant<true> {};

template <>
struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
                                    c10::qint8,
                                    std::array<Vectorized<float>, 4>,
                                    std::array<Vectorized<c10::qint32>, 4>,
                                    VECTOR_WIDTH> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            VECTOR_WIDTH>() {}
  Vectorized(c10::qint8 val)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            VECTOR_WIDTH>(val) {}
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            VECTOR_WIDTH>(ptr) {}

  static Vectorized<c10::qint8> loadu(const void* ptr) {
    return Vectorized<c10::qint8>(ptr);
  }

  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(
        tmp_values,
        reinterpret_cast<const value_type*>(ptr),
        count * sizeof(value_type));
    return loadu(tmp_values);
  }

  static Vectorized<c10::qint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;

    for (int i = 0; i < float_num_vecs(); ++i) {
      rhs[i].store(
          &float_vals[i * Vectorized<float>::size()],
          Vectorized<float>::size());
    }

    at::native::quantize_vec<c10::qint8>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint8*)qvals.data(),
        Vectorized<float>::size() * float_num_vecs());

    return Vectorized<c10::qint8>::loadu(qvals.data());
  }

  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::qint8> relu6(
      Vectorized<c10::qint8> zero_point,
      Vectorized<c10::qint8> q_six) {
    Vectorized<c10::qint8> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
    int_vec_return_type retval;
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    for (size_t i = 0; i < int_num_vecs(); ++i) {
      for (size_t j = 0; j < elem_per_int_vec; ++j) {
        retval[i].vals[j] =
            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
      }
    }
    return retval;
  }

  static Vectorized<c10::qint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    constexpr auto min_val = std::numeric_limits<value_type>::min();
    constexpr auto max_val = std::numeric_limits<value_type>::max();
    Vectorized<c10::qint8> retval;
    for (size_t i = 0; i < int_num_vecs(); ++i) {
      for (size_t j = 0; j < elem_per_int_vec; ++j) {
        int32_t rounded =
            nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
            zero_point;
        retval.vals[i * elem_per_int_vec + j] =
            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
      }
    }
    return retval;
  }
};

template <>
Vectorized<c10::qint8> inline maximum(
    const Vectorized<c10::qint8>& a,
    const Vectorized<c10::qint8>& b) {
  return a.maximum(b);
}
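// Illustrative sketch (hypothetical names): the count-taking loadu overload
// handles the tail of a buffer whose length is not a multiple of size(),
// zero-filling the lanes past `count`:
//
//   int64_t n = ...; // number of qint8 elements
//   int64_t d = n - (n % Vectorized<c10::qint8>::size());
//   for (int64_t i = 0; i < d; i += Vectorized<c10::qint8>::size()) {
//     auto v = Vectorized<c10::qint8>::loadu(src + i);
//     // ... process full vectors ...
//   }
//   if (d < n) {
//     auto v = Vectorized<c10::qint8>::loadu(src + d, n - d);
//     // ... process the remaining n - d elements ...
//   }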
template <>
struct is_vec_specialized_for<c10::quint8> : std::bool_constant<true> {};

template <>
struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
                                     c10::quint8,
                                     std::array<Vectorized<float>, 4>,
                                     std::array<Vectorized<c10::qint32>, 4>,
                                     VECTOR_WIDTH> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            VECTOR_WIDTH>() {}
  Vectorized(c10::quint8 val)
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            VECTOR_WIDTH>(val) {}
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            VECTOR_WIDTH>(ptr) {}

#if 1
  static Vectorized<c10::quint8> loadu(const void* ptr) {
    return Vectorized<c10::quint8>(ptr);
  }

  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(
        tmp_values,
        reinterpret_cast<const value_type*>(ptr),
        count * sizeof(value_type));
    return loadu(tmp_values);
  }
#else
  static Vectorized<c10::quint8> loadu(
      const void* ptr,
      int64_t count = size()) {
    if (count == size())
      return svld1_u8(ptrue, reinterpret_cast<const uint8_t*>(ptr));
    svbool_t pg = svwhilelt_b8(0ull, count);
    return svld1_u8(pg, reinterpret_cast<const uint8_t*>(ptr));
  }
#endif

  static Vectorized<c10::quint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;

    for (int i = 0; i < float_num_vecs(); ++i) {
      rhs[i].store(
          &float_vals[i * Vectorized<float>::size()],
          Vectorized<float>::size());
    }

    at::native::quantize_vec<c10::quint8>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::quint8*)qvals.data(),
        Vectorized<float>::size() * float_num_vecs());

    return Vectorized<c10::quint8>::loadu(qvals.data());
  }

  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
    Vectorized<c10::quint8> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
    Vectorized<c10::quint8> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::quint8> relu6(
      Vectorized<c10::quint8> zero_point,
      Vectorized<c10::quint8> q_six) {
    Vectorized<c10::quint8> retval;
    for (size_t i = 0; i < size(); ++i) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
    int_vec_return_type retval;
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    for (size_t i = 0; i < int_num_vecs(); ++i) {
      for (size_t j = 0; j < elem_per_int_vec; ++j) {
        retval[i].vals[j] =
            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
      }
    }
    return retval;
  }

  static Vectorized<c10::quint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    constexpr auto min_val = std::numeric_limits<value_type>::min();
    constexpr auto max_val = std::numeric_limits<value_type>::max();
    Vectorized<c10::quint8> retval;
    for (size_t i = 0; i < int_num_vecs(); ++i) {
      for (size_t j = 0; j < elem_per_int_vec; ++j) {
        int32_t rounded =
            nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
            zero_point;
        retval.vals[i * elem_per_int_vec + j] =
            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
      }
    }
    return retval;
  }
};

template <>
Vectorized<c10::quint8> inline maximum(
    const Vectorized<c10::quint8>& a,
    const Vectorized<c10::quint8>& b) {
  return a.maximum(b);
}

#endif // defined(CPU_CAPABILITY_SVE)

} // namespace CPU_CAPABILITY
} // namespace at::vec