// NOTE(review): as it appears here, every `#include` below has no header name
// and most templates have lost their `<...>` parameter/argument lists (e.g.
// `template constexpr ...`, bare `std::is_same_v`). Presumably an extraction
// artifact -- TODO confirm against the canonical file. Code tokens are
// reproduced unchanged; only layout and comments were touched.
#include
#include
#include
#include
#include
#if defined(__clang__)
#include
#elif defined(__GNUC__) || defined(__GNUG__)
#include
#include
#endif
#include
#include
#include

namespace at {
namespace vec {

// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {

// Compile-time predicate: true when the element type is one of the eight
// types listed below (the compared types are not visible in this copy).
template constexpr bool is_zarch_implemented() {
  return (
      std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v ||
      std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v);
}

// Compile-time predicate over three types (presumably the quantized element
// types -- the compared types are not visible here; TODO confirm).
template constexpr bool is_zarch_implemented_quant() {
  return ( std::is_same_v || std::is_same_v || std::is_same_v);
}

// Compile-time predicate over two types (presumably the complex element
// types -- TODO confirm).
template constexpr bool is_zarch_implemented_complex() {
  return std::is_same_v> || std::is_same_v>;
}

// Byte offsets of the two 16-byte halves, as passed to vec_xl / vec_xst.
constexpr int offset0 = 0;
constexpr int offset16 = 16;

// Maps an element byte width to a 16-byte vector of unsigned integers of that
// width. The primary template falls back to uintmax_t lanes.
template struct VecBinaryType {
  using type __attribute__((vector_size(16))) = uintmax_t;
};

template <> struct VecBinaryType<8> {
  using type = __attribute__((vector_size(16))) unsigned long long;
};

template <> struct VecBinaryType<4> {
  using type = __attribute__((vector_size(16))) unsigned int;
};

template <> struct VecBinaryType<2> {
  using type = __attribute__((vector_size(16))) unsigned short;
};

template <> struct VecBinaryType<1> {
  using type = __attribute__((vector_size(16))) unsigned char;
};

// Per-element-type traits: the 16-byte vector type, the matching unsigned
// "binary" vector type, the scalar element type and the lane count.
template struct VecInnerType {
  using Type __attribute__((vector_size(16))) = T;
  using BinaryType = typename VecBinaryType::type;
  using ElementType = T;
  static constexpr int size = 16 / sizeof(T);
};

// define for int64_t properly for load
template <> struct VecInnerType {
  using Type = __attribute__((vector_size(16))) signed long long;
  using ElementType = signed long long;
  using BinaryType = typename VecBinaryType::type;
  static constexpr int size = 16 / sizeof(signed long long);
};

// Convenience aliases over VecInnerType.
template using ZSimdVect = typename VecInnerType::Type;
template using ZSimdVectBinary = typename VecInnerType::BinaryType;
template using ZSimdVectElement = typename VecInnerType::ElementType;

// Classifies a blend bit mask into one of nine cases (0..8) so that blend()
// can pick a cheaper code path. `half1` / `half2` are the bit sets that belong
// to the first / second 16-byte half; bits outside `half1 | half2` are ignored.
// In the per-case comments below, `*` means "mixed lanes, needs vec_sel".
constexpr int blendChoiceInner(
    const uint64_t mask,
    const uint64_t half1 = 0xF,
    const uint64_t half2 = 0xF0) {
  uint64_t none = 0;
  uint64_t both = half1 | half2;
  // clamp it between 0 and both
  auto res_mask = mask & both;
  // return (a._vec0, a._vec1)
  if (res_mask == none)
    return 0;
  // return (b._vec0,b._vec1)
  else if (res_mask == both)
    return 1;
  // return (b._vec0, a._vec1)
  else if (res_mask == half1)
    return 2;
  // return (a._vec0,b._vec1)
  else if (res_mask == half2)
    return 3;
  // return (*_vec0,a._vec1)
  else if (res_mask > 0 && res_mask < half1)
    return 4;
  // return (*_vec0,b._vec1)
  else if ((res_mask & half2) == half2)
    return 5;
  // return (a._vec0,*_vec1)
  else if ((res_mask & half1) == 0 && res_mask > half1)
    return 6;
  // return (b._vec0,*_vec1)
  else if ((res_mask & half1) == half1 && res_mask > half1)
    return 7;
  // return (*_vec0,*_vec1)
  return 8;
}

// it can be used to emulate blend faster
// The specializations below supply the half masks for element sizes 1, 2, 4
// and 8 bytes. The primary template's static_assert only passes when Z lies
// outside 1..8, so any other Z inside that range fails with "not implemented".
template constexpr int blendChoice(const uint64_t mask) {
  static_assert(Z < 1 || Z > 8, "not implemented");
  return blendChoiceInner(mask);
}

// 1-byte elements: 16 mask bits per half.
template <> constexpr int blendChoice<1>(const uint64_t mask) {
  return blendChoiceInner(mask, 0x0000FFFF, 0xFFFF0000);
}

// 2-byte elements: 8 mask bits per half.
template <> constexpr int blendChoice<2>(const uint64_t mask) {
  return blendChoiceInner(mask, 0x00FF, 0xFF00);
}

// 4-byte elements: 4 mask bits per half.
template <> constexpr int blendChoice<4>(const uint64_t mask) {
  return blendChoiceInner(mask, 0xF, 0xF0);
}

// 8-byte elements: 2 mask bits per half.
template <> constexpr int blendChoice<8>(const uint64_t mask) {
  // clamp it 0 and 0xF
  return blendChoiceInner(mask, 0x3, 0xC);
}

// GetMask1 / GetMask2 build the lane-select vector for the first / second
// 16-byte half from a bit mask. The primary templates return an all-zero
// vector; the specializations below handle 1-, 2-, 4- and 8-byte elements.
template constexpr auto GetMask1(const uint64_t mask) {
  return typename VecBinaryType::type{};
}

template constexpr auto GetMask2(const uint64_t mask) {
  return typename VecBinaryType::type{};
}

// 1-byte lanes: bit i of `mask` (i = 0..15) becomes lane i, all-ones (0xFF)
// when the bit is set and zero otherwise.
template <> constexpr auto GetMask1<1>(const uint64_t mask) {
  constexpr uint8_t t = (int)0xFF;
  uint8_t g0 = (mask & 1) * t;
  uint8_t g1 = ((mask & 2) >> 1) * t;
  uint8_t g2 = ((mask & 4) >> 2) * t;
  uint8_t g3 = ((mask & 8) >> 3) * t;
  uint8_t g4 = ((mask & 16) >> 4) * t;
  uint8_t g5 = ((mask & 32) >> 5) * t;
  uint8_t g6 = ((mask & 64) >> 6) * t;
  uint8_t g7 = ((mask & 128) >> 7) * t;
  uint8_t g8 = ((mask & 256) >> 8) * t;
  uint8_t g9 = ((mask & 512) >> 9) * t;
  uint8_t g10 = ((mask & 1024) >> 10) * t;
  uint8_t g11 = ((mask & 2048) >> 11) * t;
  uint8_t g12 = ((mask & 4096) >> 12) * t;
  uint8_t g13 = ((mask & 8192) >> 13) * t;
  uint8_t g14 = ((mask & 16384) >> 14) * t;
  uint8_t g15 = ((mask & 32768) >> 15) * t;
  return (typename VecBinaryType<1>::type){
      g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15};
}

// Second half, 1-byte lanes: uses mask bits 16..31.
template <> constexpr auto GetMask2<1>(const uint64_t mask) {
  uint64_t mask2 = (mask & 0xFFFFFFFF) >> 16;
  return GetMask1<1>(mask2);
}

// 2-byte lanes: bits 0..7 of `mask` become eight 0x0000 / 0xFFFF lanes.
template <> constexpr auto GetMask1<2>(const uint64_t mask) {
  constexpr uint16_t t = (int)0xFFFF;
  uint16_t g0 = (mask & 1) * t;
  uint16_t g1 = ((mask & 2) >> 1) * t;
  uint16_t g2 = ((mask & 4) >> 2) * t;
  uint16_t g3 = ((mask & 8) >> 3) * t;
  uint16_t g4 = ((mask & 16) >> 4) * t;
  uint16_t g5 = ((mask & 32) >> 5) * t;
  uint16_t g6 = ((mask & 64) >> 6) * t;
  uint16_t g7 = ((mask & 128) >> 7) * t;
  return (typename VecBinaryType<2>::type){g0, g1, g2, g3, g4, g5, g6, g7};
}

// Second half, 2-byte lanes: uses mask bits 8..15.
template <> constexpr auto GetMask2<2>(const uint64_t mask) {
  uint64_t mask2 = (mask & 0xFFFF) >> 8;
  return GetMask1<2>(mask2);
}

// 4-byte lanes: bits 0..3 of `mask` become four all-zero / all-ones lanes.
template <> constexpr auto GetMask1<4>(const uint64_t mask) {
  uint32_t g0 = (mask & 1) * 0xffffffff;
  uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
  uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
  uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
  return (typename VecBinaryType<4>::type){g0, g1, g2, g3};
}

// Second half, 4-byte lanes: uses mask bits 4..7.
template <> constexpr auto GetMask2<4>(const uint64_t mask) {
  uint64_t mask2 = (mask & 0xFF) >> 4;
  return GetMask1<4>(mask2);
}

// 8-byte lanes: bits 0..1 of `mask` become two all-zero / all-ones lanes.
template <> constexpr auto GetMask1<8>(const uint64_t mask) {
  uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
  uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
  return (typename VecBinaryType<8>::type){g0, g1};
}

// Second half, 8-byte lanes: uses mask bits 2..3.
template <> constexpr auto GetMask2<8>(const uint64_t mask) {
  uint64_t mask2 = (mask & 0xF) >> 2;
  return GetMask1<8>(mask2);
}

// Widens a blend mask so that every input bit i becomes the bit pair
// (2i, 2i + 1). The primary template returns 0.
template constexpr int maskForComplex(uint32_t mask) {
  return 0;
}

// Uses the low 4 mask bits and produces up to 8 output bits.
template <> constexpr int maskForComplex<8>(uint32_t mask) {
  mask = mask & 0xF;
  int complex_mask = 0;
  if (mask & 1)
    complex_mask |= 3;
  if (mask & 2)
    complex_mask |= (3 << 2);
  if (mask & 4)
    complex_mask |= (3 << 4);
  if (mask & 8)
    complex_mask |= (3 << 6);
  return complex_mask;
}

// Uses the low 2 mask bits and produces up to 4 output bits.
template <> constexpr int maskForComplex<16>(uint32_t mask) {
  mask = mask & 0x3;
  int complex_mask = 0;
  if (mask & 1)
    complex_mask |= 3;
  if (mask & 2)
    complex_mask |= (3 << 2);
  return complex_mask;
}

// Constant blend masks: 0xAA selects every odd-numbered bit of a byte, 0x0A
// bits 1 and 3. (The type each variant applies to is not visible in this
// copy -- TODO confirm.)
template > constexpr int blend_choice() {
  return 0xAA;
}

template <> constexpr int blend_choice>() {
  return 0x0A;
}

// Returns a value with the low `x` bits set: (1 << x) - 1.
constexpr int64_t allbitset(int16_t x) {
  int64_t onex = 1;
  return (onex << x) - onex;
}

namespace { /* unnamed namespace */

// Byte permutation interleaving the 4-byte groups at offsets 0 and 8 of `x`
// with the same groups of `y` (vec_perm indices 16..31 refer to `y`).
ZSimdVect vec_mergee(ZSimdVect x, ZSimdVect y) {
  constexpr ZSimdVectBinary mergee_mask{
      0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27};
  return vec_perm(x, y, mergee_mask);
}

// Overload that forwards to vec_mergeh (the element type is not visible in
// this copy).
ZSimdVect vec_mergee(ZSimdVect x, ZSimdVect y) {
  return vec_mergeh(x, y);
}

// Same as vec_mergee above but for the 4-byte groups at offsets 4 and 12.
ZSimdVect vec_mergeo(ZSimdVect x, ZSimdVect y) {
  constexpr ZSimdVectBinary mergeo_mask{
      4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
  return vec_perm(x, y, mergeo_mask);
}

// Overload that forwards to vec_mergel.
ZSimdVect vec_mergeo(ZSimdVect x, ZSimdVect y) {
  return vec_mergel(x, y);
}

} /* unnamed namespace */

//
// Constant index vector handed to vec_bperm_u128 by zero_mask().
template constexpr auto GetBpermZeroMask() {
  return ZSimdVectBinary{
      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 96, 64, 32, 0};
}

template <> constexpr auto GetBpermZeroMask() {
  return ZSimdVectBinary{
      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 0};
}

// Byte permutation that swaps adjacent 4-byte groups (0<->1, 2<->3).
constexpr auto GetSwapMaskFloat() {
  return ZSimdVectBinary{
      4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
}

template struct is_vec_specialized_for()>> : std::bool_constant {};

// Vectorized specialization that stores its lanes in two 16-byte vector
// registers, `_vec0` and `_vec1`.
template struct Vectorized()>> {
 public:
  using value_type = T;
  using vtype = ZSimdVect;
  using vmaskType = ZSimdVectBinary;
  using size_type = int;
  // because of gcc inconsistency for int64_t we are
obliged to use this, not // value_type using ElementType = ZSimdVectElement; using vinner_data = std::pair; private: vtype _vec0; vtype _vec1; public: static constexpr size_type size() { return VECTOR_WIDTH / sizeof(ElementType); } Vectorized() {} C10_ALWAYS_INLINE Vectorized(vtype v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(const vinner_data& v) : _vec0{v.first}, _vec1{v.second} {} C10_ALWAYS_INLINE Vectorized(vtype v1, vtype v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(T s) : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {} template struct LoaduHelper { static Vectorized C10_ALWAYS_INLINE loadu(const U* ptr, int count = size()) { __at_align__ ElementType tmp_values[size()] = {}; std::memcpy( tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); return { vec_xl(offset0, &(tmp_values[0])), vec_xl(offset16, &(tmp_values[0]))}; } }; template struct LoaduHelper { static Vectorized C10_ALWAYS_INLINE loadu(const ElementType* ptr, int count = size()) { if (count == size()) { return {vec_xl(offset0, ptr), vec_xl(offset16, ptr)}; } __at_align__ ElementType tmp_values[size()] = {}; std::memcpy( tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); return { vec_xl(offset0, &(tmp_values[0])), vec_xl(offset16, &(tmp_values[0]))}; } }; template static Vectorized C10_ALWAYS_INLINE loadu(const U* ptr, int count = size()) { return LoaduHelper::loadu(ptr, count); } template static Vectorized C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) { // load only first 8 bytes // only intended to be used with uint8_t return loadu(ptr, 8 / sizeof(ElementType)); } template struct StoreHelper { static void C10_ALWAYS_INLINE store(const Vectorized& vec, U* ptr, int count = size()) { if (count > 0) { __at_align__ ElementType tmp_values[size()]; vec_xst(vec._vec0, offset0, &(tmp_values[0])); vec_xst(vec._vec1, offset16, &(tmp_values[0])); std::memcpy( ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); } } }; 
// NOTE(review): the `<...>` template parameter/argument lists in this span are
  // missing in this copy (e.g. `template = 0>`); code tokens are reproduced
  // as found -- TODO confirm against the canonical file.

  // Store to an ElementType pointer: a full-width request is written straight
  // to `ptr`; a partial request (0 < count < size()) goes through an aligned
  // scratch buffer. Does nothing when count <= 0.
  template struct StoreHelper {
    static void C10_ALWAYS_INLINE
    store(const Vectorized& vec, ElementType* ptr, int count = size()) {
      if (count == size()) {
        vec_xst(vec._vec0, offset0, ptr);
        vec_xst(vec._vec1, offset16, ptr);
      } else if (count > 0) {
        __at_align__ ElementType tmp_values[size()];
        vec_xst(vec._vec0, offset0, &(tmp_values[0]));
        vec_xst(vec._vec1, offset16, &(tmp_values[0]));
        std::memcpy(
            ptr, tmp_values, std::min(count, size()) * sizeof(ElementType));
      }
    }
  };

  // Public entry point: stores up to `count` elements (default: all lanes).
  template void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const {
    return StoreHelper::store(*this, ptr, count);
  }

  // Read-only access to the two halves.
  C10_ALWAYS_INLINE const vtype& vec0() const {
    return _vec0;
  }

  C10_ALWAYS_INLINE const vtype& vec1() const {
    return _vec1;
  }

  // Both halves as a pair.
  C10_ALWAYS_INLINE vinner_data data() const {
    return std::make_pair<>(_vec0, _vec1);
  }

  C10_ALWAYS_INLINE operator vinner_data() const {
    return data();
  }

  // The halves cast to the unsigned "binary" vector type, for bitwise use.
  C10_ALWAYS_INLINE const vmaskType vecb0() const {
    return (vmaskType)_vec0;
  }
  C10_ALWAYS_INLINE const vmaskType vecb1() const {
    return (vmaskType)_vec1;
  }

  // Run-time blend: combines `a` and `b` with vec_sel, using the bits of
  // `mask` as the selector.
  static Vectorized C10_ALWAYS_INLINE blendv(
      const Vectorized& a,
      const Vectorized& b,
      const Vectorized& mask) {
    return {
        vec_sel(a._vec0, b._vec0, mask.vecb0()),
        vec_sel(a._vec1, b._vec1, mask.vecb1())};
  }

  // Element-wise constructors for 4, 8, 16 and 32 lanes. The first half of
  // the arguments fills _vec0, the second half fills _vec1.
  template = 0> C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4)
      : _vec0{s1, s2}, _vec1{s3, s4} {}

  template = 0> C10_ALWAYS_INLINE
  Vectorized(T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8)
      : _vec0{s1, s2, s3, s4}, _vec1{s5, s6, s7, s8} {}

  template = 0> C10_ALWAYS_INLINE Vectorized(
      T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8,
      T s9, T s10, T s11, T s12, T s13, T s14, T s15, T s16)
      : _vec0{s1, s2, s3, s4, s5, s6, s7, s8},
        _vec1{s9, s10, s11, s12, s13, s14, s15, s16} {}

  template = 0> C10_ALWAYS_INLINE Vectorized(
      T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8,
      T s9, T s10, T s11, T s12, T s13, T s14, T s15, T s16,
      T s17, T s18, T s19, T s20, T s21, T s22, T s23, T s24,
      T s25, T s26, T s27, T s28, T s29, T s30, T s31, T s32)
      : _vec0{s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16},
        _vec1{
            s17, s18, s19, s20, s21, s22, s23, s24,
            s25, s26, s27, s28, s29, s30, s31, s32} {}

  // arange: lane i holds base + i * step. One overload per lane count
  // (4, 8, 16, 32).
  template static std::enable_if_t> arange(
      T base = 0, step_t step = static_cast(1)) {
    return Vectorized(base, base + step, base + 2 * step, base + 3 * step);
  }

  template static std::enable_if_t> arange(
      T base = 0, step_t step = static_cast(1)) {
    return Vectorized(
        base, base + step, base + 2 * step, base + 3 * step,
        base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
  }

  template static std::enable_if_t> arange(
      T base = 0, step_t step = static_cast(1)) {
    return Vectorized(
        base, base + step, base + 2 * step, base + 3 * step,
        base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step,
        base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step,
        base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
  }

  template static std::enable_if_t> arange(
      T base = 0, step_t step = static_cast(1)) {
    return Vectorized(
        base, base + step, base + 2 * step, base + 3 * step,
        base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step,
        base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step,
        base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
        base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
        base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
        base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
        base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step);
  }

  // blend section
  // Compile-time blend. The nine overloads are selected by comparing a
  // blendChoice-style classification of `mask` against 0..8 (see
  // blendChoiceInner for what each case means). Cases 0-3 only shuffle whole
  // halves; cases 4-8 need vec_sel on one or both halves.
  template static std::enable_if_t(mask) == 0, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    return a;
  }

  template static std::enable_if_t(mask) == 1, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    return b;
  }

  template static std::enable_if_t(mask) == 2, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    return {b._vec0, a._vec1};
  }

  template static std::enable_if_t(mask) == 3, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    return {a._vec0, b._vec1};
  }

  template static std::enable_if_t(mask) == 4, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    const vmaskType mask_1st = GetMask1(mask);
    return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
  }

  template static std::enable_if_t(mask) == 5, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    const vmaskType mask_1st = GetMask1(mask);
    return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
  }

  template static std::enable_if_t(mask) == 6, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    const vmaskType mask_2nd = GetMask2(mask);
    // generated masks
    return {a._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
  }

  template static std::enable_if_t(mask) == 7, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    const vmaskType mask_2nd = GetMask2(mask);
    // generated masks
    return {b._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
  }

  template static std::enable_if_t(mask) == 8, Vectorized> C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    const vmaskType mask_1st = GetMask1(mask);
    const vmaskType mask_2nd = GetMask2(mask);
    return {
        (vtype)vec_sel(a._vec0, b._vec0, mask_1st),
        (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
  }

  // set_inner is a compile-time recursion over Z that turns the run-time
  // `count` into a call to blend with a compile-time mask.
  // Terminal case (Z >= C): returns `b` unchanged.
  template static inline std::enable_if_t<(Z >= C), Vectorized> set_inner(
      const Vectorized& a,
      const Vectorized& b,
      size_t count) {
    return b;
  }

  // Recursive case (Z < C): blends when count == Z, otherwise recurses.
  template static inline std::enable_if_t<(Z < C), Vectorized> set_inner(
      const Vectorized& a,
      const Vectorized& b,
      size_t count) {
    if (count == Z)
      return blend(a, b);
    else
      return set_inner(a, b, count);
  }

  // Returns `a` when count == 0; otherwise dispatches on `count` through
  // set_inner, which returns `b` unchanged once count reaches size() or more.
  static Vectorized set(
      const Vectorized& a,
      const Vectorized& b,
      size_t count = size()) {
    if (count == 0)
      return a;
    return set_inner<1, size()>(a, b, count);
  }

  // Lane indexing is explicitly disabled.
  const ElementType& operator[](int idx) const = delete;
ElementType& operator[](int idx) = delete;

  // NOTE(review): the `<...>` template parameter/argument lists in this span
  // are missing in this copy (e.g. `std::enable_if_t, int> = 0>`); code tokens
  // are reproduced as found -- TODO confirm against the canonical file.

  // Bitwise NOT of every lane (vec_nor of a value with itself).
  Vectorized _not() const {
    return {(vtype)vec_nor(vecb0(), vecb0()), (vtype)vec_nor(vecb1(), vecb1())};
  }

  // Comparisons whose result is ANDed with the bit pattern of T(1) instead of
  // being left as the raw output of the comparison operator.
  Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const {
    return (*this == other) & Vectorized((T)1.0);
  }
  Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const {
    return (*this != other) & Vectorized((T)1.0);
  }
  Vectorized C10_ALWAYS_INLINE gt(const Vectorized& other) const {
    return (*this > other) & Vectorized((T)1.0);
  }
  Vectorized C10_ALWAYS_INLINE ge(const Vectorized& other) const {
    return (*this >= other) & Vectorized((T)1.0);
  }
  Vectorized C10_ALWAYS_INLINE lt(const Vectorized& other) const {
    return (*this < other) & Vectorized((T)1.0);
  }
  Vectorized C10_ALWAYS_INLINE le(const Vectorized& other) const {
    return (*this <= other) & Vectorized((T)1.0);
  }

  // abs(): one overload applies vec_abs, the other returns the value
  // unchanged (presumably the unsigned case -- the enabling condition is not
  // visible in this copy; TODO confirm).
  template , int> = 0> Vectorized C10_ALWAYS_INLINE abs() const {
    return {vec_abs(_vec0), vec_abs(_vec1)};
  }

  template , int> = 0> Vectorized C10_ALWAYS_INLINE abs() const {
    return {_vec0, _vec1};
  }

  // Lane-wise negation.
  Vectorized C10_ALWAYS_INLINE neg() const {
    return {-_vec0, -_vec1};
  }

  // A lane is reported as NaN when it does not compare equal to itself.
  Vectorized isnan() const {
    auto x = *this;
    auto ret = (x == x);
    return ret._not();
  }

  // Scalar scan of both halves; true if any lane is NaN or infinite.
  bool has_inf_nan() const {
    for (const auto i : c10::irange(size() / 2)) {
      if (_isnan(_vec0[i]) || _isinf(_vec0[i])) {
        return true;
      }
    }
    for (const auto i : c10::irange(size() / 2)) {
      if (_isnan(_vec1[i]) || _isinf(_vec1[i])) {
        return true;
      }
    }
    return false;
  }

  // angle(): pi for negative lanes, 0 otherwise. This overload additionally
  // passes NaN lanes through unchanged.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized angle() const {
    auto tmp = blendv(
        Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
    return blendv(tmp, *this, isnan());
  }

  // angle() overload without the NaN pass-through.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized angle() const {
    return blendv(
        Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
  }

  // Real-valued vector: real() and conj() return the value itself, imag()
  // returns zero.
  Vectorized real() const {
    return *this;
  }
  Vectorized imag() const {
    return Vectorized{0};
  }
  Vectorized conj() const {
    return *this;
  }

  // Compares every lane with zero, gathers bits of each half's comparison
  // result with vec_bperm_u128, and ORs the second half's bits in shifted
  // left by size() / 2.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  int zero_mask() const {
    auto cmp = (*this == Vectorized(0));
    constexpr auto mask_zero_bits = GetBpermZeroMask();
    ZSimdVectBinary result0 =
        vec_bperm_u128((ZSimdVectBinary)cmp.vecb0(), mask_zero_bits);
    ZSimdVectBinary result1 =
        vec_bperm_u128((ZSimdVectBinary)cmp.vecb1(), mask_zero_bits);
    return (result0[0] | (result1[0] << (size() / 2)));
  }

  // Lane-wise rounding helpers; each forwards to the vector builtin of the
  // same name.
  Vectorized C10_ALWAYS_INLINE floor() const {
    return {vec_floor(_vec0), vec_floor(_vec1)};
  }

  Vectorized C10_ALWAYS_INLINE ceil() const {
    return {vec_ceil(_vec0), vec_ceil(_vec1)};
  }

  Vectorized C10_ALWAYS_INLINE round() const {
    return {vec_round(_vec0), vec_round(_vec1)};
  }

  Vectorized C10_ALWAYS_INLINE rint() const {
    return {vec_rint(_vec0), vec_rint(_vec1)};
  }

  Vectorized C10_ALWAYS_INLINE trunc() const {
    return {vec_trunc(_vec0), vec_trunc(_vec1)};
  }

  // Fractional part: x - trunc(x).
  Vectorized C10_ALWAYS_INLINE frac() const {
    return *this - trunc();
  }

  Vectorized C10_ALWAYS_INLINE sqrt() const {
    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
  }
  // 1 / x, computed with a full division.
  Vectorized C10_ALWAYS_INLINE reciprocal() const {
    return Vectorized((T)1) / (*this);
  }
  // 1 / sqrt(x), computed as sqrt() followed by reciprocal().
  Vectorized C10_ALWAYS_INLINE rsqrt() const {
    return sqrt().reciprocal();
  }

  // mapOrdinary: applies a scalar function to every lane, one call per lane.
  // Unary, 8 float lanes.
  template , int> = 0> inline Vectorized mapOrdinary(float (*const f)(float)) const {
    float a00 = f(_vec0[0]);
    float a01 = f(_vec0[1]);
    float a02 = f(_vec0[2]);
    float a03 = f(_vec0[3]);
    float a10 = f(_vec1[0]);
    float a11 = f(_vec1[1]);
    float a12 = f(_vec1[2]);
    float a13 = f(_vec1[3]);
    return Vectorized{a00, a01, a02, a03, a10, a11, a12, a13};
  }

  // Unary, 4 double lanes.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  inline Vectorized mapOrdinary(double (*const f)(double)) const {
    return Vectorized(f(_vec0[0]), f(_vec0[1]), f(_vec1[0]), f(_vec1[1]));
  }

  // Binary, 8 float lanes: f(this[i], b[i]).
  template , int> = 0> inline Vectorized mapOrdinary(
      float (*const f)(float, float),
      const Vectorized& b) const {
    float a00 = f(_vec0[0], b._vec0[0]);
    float a01 = f(_vec0[1], b._vec0[1]);
    float a02 = f(_vec0[2], b._vec0[2]);
    float a03 = f(_vec0[3], b._vec0[3]);
    float a10 = f(_vec1[0], b._vec1[0]);
    float a11 = f(_vec1[1], b._vec1[1]);
    float a12 = f(_vec1[2], b._vec1[2]);
    float a13 = f(_vec1[3], b._vec1[3]);
    return Vectorized{a00, a01, a02, a03, a10, a11, a12, a13};
  }

  // Binary, 4 double lanes.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  inline Vectorized mapOrdinary(
      double (*const f)(double, double),
      const Vectorized& b) const {
    return Vectorized(
        f(_vec0[0], b._vec0[0]),
        f(_vec0[1], b._vec0[1]),
        f(_vec1[0], b._vec1[0]),
        f(_vec1[1], b._vec1[1]));
  }

  // mapSleef: applies a 16-byte-vector function to each half. Callers pass
  // both a float (`f`) and a double (`d`) variant; the overload selected for
  // T uses the matching one and ignores the other.
  // Unary, uses `f`.
  template <
      typename FloatOp,
      typename DoubleOp,
      typename U = T,
      std::enable_if_t, int> = 0>
  inline Vectorized mapSleef(FloatOp f, DoubleOp d) const {
    vtype a0 = f(_vec0);
    vtype a1 = f(_vec1);
    return Vectorized{a0, a1};
  }

  // Unary, uses `d`.
  template <
      typename FloatOp,
      typename DoubleOp,
      typename U = T,
      std::enable_if_t, int> = 0>
  inline Vectorized mapSleef(FloatOp f, DoubleOp d) const {
    return Vectorized(d(_vec0), d(_vec1));
  }

  // Binary, uses `f`.
  template <
      typename FloatOp,
      typename DoubleOp,
      typename U = T,
      std::enable_if_t, int> = 0>
  inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b) const {
    vtype a0 = f(_vec0, b._vec0);
    vtype a1 = f(_vec1, b._vec1);
    return Vectorized{a0, a1};
  }

  // Binary, uses `d`.
  template <
      typename FloatOp,
      typename DoubleOp,
      typename U = T,
      std::enable_if_t, int> = 0>
  inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b) const {
    return Vectorized(d(_vec0, b._vec0), d(_vec1, b._vec1));
  }

  // Math functions that forward to the 4-float / 2-double Sleef routines.
  Vectorized acos() const {
    return mapSleef(Sleef_acosf4_u10, Sleef_acosd2_u10);
  }
  Vectorized asin() const {
    return mapSleef(Sleef_asinf4_u10, Sleef_asind2_u10);
  }
  Vectorized atan() const {
    return mapSleef(Sleef_atanf4_u10, Sleef_atand2_u10);
  }
  Vectorized atanh() const {
    return mapSleef(Sleef_atanhf4_u10, Sleef_atanhd2_u10);
  }

  Vectorized erf() const {
    return mapSleef(Sleef_erff4_u10, Sleef_erfd2_u10);
  }
  Vectorized erfc() const {
    return mapSleef(Sleef_erfcf4_u15, Sleef_erfcd2_u15);
  }

  Vectorized exp() const {
    return mapSleef(Sleef_expf4_u10, Sleef_expd2_u10);
  }
  Vectorized exp2() const {
    return mapSleef(Sleef_exp2f4_u10, Sleef_exp2d2_u10);
  }
  Vectorized expm1() const {
    return mapSleef(Sleef_expm1f4_u10, Sleef_expm1d2_u10);
  }
  // No separate lower-precision path here: simply calls exp().
  Vectorized exp_u20() const {
    return exp();
  }

  Vectorized log() const {
    return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);
  }
  Vectorized log2() const {
    return mapSleef(Sleef_log2f4_u10, Sleef_log2d2_u10);
  }
  Vectorized log10() const {
    return mapSleef(Sleef_log10f4_u10, Sleef_log10d2_u10);
  }
  Vectorized log1p() const {
    return mapSleef(Sleef_log1pf4_u10, Sleef_log1pd2_u10);
  }

  Vectorized sin() const {
    return mapSleef(Sleef_sinf4_u10, Sleef_sind2_u10);
  }
  Vectorized sinh() const {
    return mapSleef(Sleef_sinhf4_u10, Sleef_sinhd2_u10);
  }
  Vectorized cos() const {
    return mapSleef(Sleef_cosf4_u10, Sleef_cosd2_u10);
  }
  Vectorized cosh() const {
    return mapSleef(Sleef_coshf4_u10, Sleef_coshd2_u10);
  }

  Vectorized tan() const {
    return mapSleef(Sleef_tanf4_u10, Sleef_tand2_u10);
  }
  Vectorized tanh() const {
    return mapSleef(Sleef_tanhf4_u10, Sleef_tanhd2_u10);
  }

  Vectorized lgamma() const {
    return mapSleef(Sleef_lgammaf4_u10, Sleef_lgammad2_u10);
  }

  Vectorized atan2(const Vectorized& b) const {
    return mapSleef(Sleef_atan2f4_u10, Sleef_atan2d2_u10, b);
  }
  Vectorized copysign(const Vectorized& sign) const {
    return mapSleef(Sleef_copysignf4, Sleef_copysignd2, sign);
  }
  Vectorized fmod(const Vectorized& q) const {
    return mapSleef(Sleef_fmodf4, Sleef_fmodd2, q);
  }

  Vectorized hypot(const Vectorized& b) const {
    return mapSleef(Sleef_hypotf4_u05, Sleef_hypotd2_u05, b);
  }

  Vectorized pow(const Vectorized& b) const {
    return mapSleef(Sleef_powf4_u10, Sleef_powd2_u10, b);
  }

  Vectorized nextafter(const Vectorized& b) const {
    return mapSleef(Sleef_nextafterf4, Sleef_nextafterd2, b);
  }

  // Functions evaluated lane by lane through the scalar calc_* helpers.
  Vectorized erfinv() const {
    return mapOrdinary(calc_erfinv);
  }

  Vectorized digamma() const {
    return mapOrdinary(calc_digamma);
  }

  Vectorized igamma(const Vectorized& x) const {
    return mapOrdinary(calc_igamma, x);
  }

  Vectorized igammac(const Vectorized& x) const {
    return mapOrdinary(calc_igammac, x);
  }

  Vectorized i0() const {
    return mapOrdinary(calc_i0);
  }

  Vectorized i0e() const {
    return mapOrdinary(calc_i0e);
  }
// NOTE(review): the `<...>` template parameter/argument lists in this span are
  // missing in this copy (e.g. `std::enable_if_t, int> = 0>`, bare
  // `Vectorized`, `static_cast>(...)`); code tokens are reproduced as found --
  // TODO confirm against the canonical file.

  // Lane-wise minimum via vec_min.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized minimum(const Vectorized& other) const {
    return {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)};
  }

  /* Propagates NaN if either input is a NaN. */
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized minimum(const Vectorized& other) const {
    Vectorized tmp = {
        vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)};
    tmp = blendv(tmp, *this, isnan());
    return blendv(tmp, other, other.isnan());
  }

  // Lane-wise maximum via vec_max.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized maximum(const Vectorized& other) const {
    return {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)};
  }

  /* Propagates NaN if either input is a NaN. */
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized maximum(const Vectorized& other) const {
    Vectorized tmp = {
        vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)};
    tmp = blendv(tmp, *this, isnan());
    return blendv(tmp, other, other.isnan());
  }

  // Lower clamp: max(x, min) per lane.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized clamp_min(const Vectorized& min) const {
    return {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)};
  }

  /* Keeps NaN if actual value is NaN */
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized clamp_min(const Vectorized& min) const {
    Vectorized tmp = {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)};
    return blendv(tmp, *this, isnan());
  }

  // Upper clamp: min(x, max) per lane.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized clamp_max(const Vectorized& max) const {
    return {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)};
  }

  /* Keeps NaN if actual value is NaN */
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized clamp_max(const Vectorized& max) const {
    Vectorized tmp = {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)};
    return blendv(tmp, *this, isnan());
  }

  // Swaps adjacent 4-byte groups in each half using the byte permutation from
  // GetSwapMaskFloat().
  template , int> = 0> Vectorized swapped() const {
    auto swap_mask = GetSwapMaskFloat();
    vtype v0 = vec_perm(_vec0, _vec0, swap_mask);
    vtype v1 = vec_perm(_vec1, _vec1, swap_mask);
    return {v0, v1};
  }

  // Two-lane-per-half variant: swaps the two lanes of each half.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized swapped() const {
    vtype v0 = {_vec0[1], _vec0[0]};
    vtype v1 = {_vec1[1], _vec1[0]};
    return {v0, v1};
  }

  // Applies the free function vec_mergee to each half of `first` and
  // `second`.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  static Vectorized mergee(Vectorized& first, Vectorized& second) {
    return {
        vec_mergee(first._vec0, second._vec0),
        vec_mergee(first._vec1, second._vec1)};
  }

  // Applies the free function vec_mergeo to each half of `first` and
  // `second`.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  static Vectorized mergeo(Vectorized& first, Vectorized& second) {
    return {
        vec_mergeo(first._vec0, second._vec0),
        vec_mergeo(first._vec1, second._vec1)};
  }

  // Adds each vector to its swapped() copy and merges the even positions of
  // the two sums.
  static Vectorized horizontal_add_perm(
      Vectorized& first,
      Vectorized& second) {
    // we will simulate it differently with 6 instructions total
    // let's permute each input so that adding it to itself gives the
    // horizontal sums
    auto first_perm = first.swapped(); // 2 perm
    auto second_perm = second.swapped(); // 2 perm
    // sum
    auto first_ret = first + first_perm; // 2 add
    auto second_ret = second + second_perm; // 2 add
    // now let's choose the evens
    return mergee(first_ret, second_ret); // 2 mergee's
  }

  // Same scheme as horizontal_add_perm, with subtraction.
  static Vectorized horizontal_sub_perm(
      Vectorized& first,
      Vectorized& second) {
    // we will simulate it differently with 6 instructions total
    // let's permute each input so that subtracting it from itself gives the
    // horizontal differences
    auto first_perm = first.swapped(); // 2 perm
    auto second_perm = second.swapped(); // 2 perm
    // difference
    auto first_ret = first - first_perm; // 2 sub
    auto second_ret = second - second_perm; // 2 sub
    // now let's choose the evens
    return mergee(first_ret, second_ret); // 2 mergee's
  }

  // Single-vector forms: merge each half with itself.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized mergee() const {
    return {vec_mergee(_vec0, _vec0), vec_mergee(_vec1, _vec1)};
  }

  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized mergeo() const {
    return {vec_mergeo(_vec0, _vec0), vec_mergeo(_vec1, _vec1)};
  }

  // Reads the first eight lanes of _vec0 as int32_t and builds an 8-element
  // vector from them; _vec1 is not read.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized to_vec_float_helper() const {
    int32_t values[8] = {
        _vec0[0],
        _vec0[1],
        _vec0[2],
        _vec0[3],
        _vec0[4],
        _vec0[5],
        _vec0[6],
        _vec0[7],
    };

    return Vectorized{
        values[0], values[1], values[2], values[3],
        values[4], values[5], values[6], values[7]};
  }

  // Casts the eight lanes (four per half) to uint8_t and returns them in the
  // first 8 positions of a 32-element vector; the remaining 24 are zero.
  template <
      typename U = T,
      std::enable_if_t, int> = 0>
  Vectorized to_vec_uint8_helper() const {
    // helper function for float to uint8_t conversion
    uint8_t values[8] = {
        static_cast(_vec0[0]),
        static_cast(_vec0[1]),
        static_cast(_vec0[2]),
        static_cast(_vec0[3]),
        static_cast(_vec1[0]),
        static_cast(_vec1[1]),
        static_cast(_vec1[2]),
        static_cast(_vec1[3]),
    };

    return Vectorized{
        values[0], values[1], values[2], values[3],
        values[4], values[5], values[6], values[7],
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
    };
  }
};

// Defines arithmetic (+ - * /), bitwise (& | ^) and comparison operators for
// one element type. Arithmetic works on the typed halves, bitwise operators on
// the unsigned "binary" view, and comparisons wrap the vec_cmp* builtins
// (operator!= is vec_cmpeq followed by _not()).
#define ZVECTOR_OPERATORS(typex) \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator+( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec0() + b.vec0(), a.vec1() + b.vec1()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator-( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec0() - b.vec0(), a.vec1() - b.vec1()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator*( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec0() * b.vec0(), a.vec1() * b.vec1()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator/( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator&( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        (Vectorized::vtype)(a.vecb0() & b.vecb0()), \
        (Vectorized::vtype)(a.vecb1() & b.vecb1())}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator|( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        (Vectorized::vtype)(a.vecb0() | b.vecb0()), \
        (Vectorized::vtype)(a.vecb1() | b.vecb1())}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator^( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        (Vectorized::vtype)(a.vecb0() ^ b.vecb0()), \
        (Vectorized::vtype)(a.vecb1() ^ b.vecb1())}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator==( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator!=( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())} \
        ._not(); \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator>( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        vec_cmpgt(a.vec0(), b.vec0()), vec_cmpgt(a.vec1(), b.vec1())}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator>=( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        vec_cmpge(a.vec0(), b.vec0()), vec_cmpge(a.vec1(), b.vec1())}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator<( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        vec_cmplt(a.vec0(), b.vec0()), vec_cmplt(a.vec1(), b.vec1())}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator<=( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{ \
        vec_cmple(a.vec0(), b.vec0()), vec_cmple(a.vec1(), b.vec1())}; \
  }

ZVECTOR_OPERATORS(float)
ZVECTOR_OPERATORS(double)
ZVECTOR_OPERATORS(int8_t)
ZVECTOR_OPERATORS(uint8_t)
ZVECTOR_OPERATORS(uint16_t)
ZVECTOR_OPERATORS(int16_t)
ZVECTOR_OPERATORS(int32_t)
ZVECTOR_OPERATORS(int64_t)

#undef ZVECTOR_OPERATORS

// Integer-only operators: <<, >> and ~. The shifts spill both operands to
// arrays and shift lane by lane. A shift count that is negative or not below
// max_shift yields 0 for <<, and `a >> max_shift` for >>.
#define ZVECTOR_OPERATORS(typex) \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator<<( \
      const Vectorized& a, const Vectorized& b) { \
    constexpr Vectorized::ElementType max_shift = \
        sizeof(Vectorized::ElementType) * CHAR_BIT; \
    \
    Vectorized::ElementType a_array[Vectorized::size()]; \
    Vectorized::ElementType b_array[Vectorized::size()]; \
    Vectorized::ElementType c_array[Vectorized::size()]; \
    \
    a.store(a_array); \
    b.store(b_array); \
    \
    for (int i = 0; i != Vectorized::size(); i++) { \
      typex shift = b_array[i]; \
      if ((static_cast>(shift) < 0) || \
          (shift >= max_shift)) { \
        c_array[i] = 0; \
      } else { \
        c_array[i] = static_cast>(a_array[i]) \
            << shift; \
      } \
    } \
    \
    return Vectorized::loadu(c_array); \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator>>( \
      const Vectorized& a, const Vectorized& b) { \
    /* right shift value to retain sign bit for signed and no bits for \
     * unsigned */ \
    constexpr Vectorized::ElementType max_shift = \
        sizeof(typex) * CHAR_BIT - std::is_signed_v; \
    \
    Vectorized::ElementType a_array[Vectorized::size()]; \
    Vectorized::ElementType b_array[Vectorized::size()]; \
    Vectorized::ElementType c_array[Vectorized::size()]; \
    \
    a.store(a_array); \
    b.store(b_array); \
    \
    for (int i = 0; i != Vectorized::size(); i++) { \
      typex shift = b_array[i]; \
      if ((static_cast>(shift) < 0) || \
          (shift >= max_shift)) { \
        c_array[i] = a_array[i] >> max_shift; \
      } else { \
        c_array[i] = a_array[i] >> shift; \
      } \
    } \
    \
    return Vectorized::loadu(c_array); \
  } \
  \
  template <> \
  inline Vectorized operator~(const Vectorized& a) { \
    return a._not(); \
  }

ZVECTOR_OPERATORS(int8_t)
ZVECTOR_OPERATORS(uint8_t)
ZVECTOR_OPERATORS(uint16_t)
ZVECTOR_OPERATORS(int16_t)
ZVECTOR_OPERATORS(int32_t)
ZVECTOR_OPERATORS(int64_t)

#undef ZVECTOR_OPERATORS

// Free-function maximum / minimum that forward to the member functions.
#define DEFINE_MAXMIN_FUNCS(operand_type) \
  template <> \
  Vectorized inline maximum( \
      const Vectorized& a, const Vectorized& b) { \
    return a.maximum(b); \
  } \
  template <> \
  Vectorized inline minimum( \
      const Vectorized& a, const Vectorized& b) { \
    return a.minimum(b); \
  }

// Adds clamp_min / clamp_max / clamp on top of DEFINE_MAXMIN_FUNCS; clamp is
// clamp_max(clamp_min(a, min), max).
// NOTE(review): uint16_t gets ZVECTOR_OPERATORS above but is not in the
// instantiation list below -- TODO confirm whether that is intentional.
#define DEFINE_CLAMP_MAXMIN_FUNCS(typex) \
  DEFINE_MAXMIN_FUNCS(typex) \
  template <> \
  Vectorized C10_ALWAYS_INLINE clamp_min( \
      const Vectorized& a, const Vectorized& min) { \
    return a.clamp_min(min); \
  } \
  template <> \
  Vectorized C10_ALWAYS_INLINE clamp_max( \
      const Vectorized& a, const Vectorized& max) { \
    return a.clamp_max(max); \
  } \
  template <> \
  Vectorized C10_ALWAYS_INLINE clamp( \
      const Vectorized& a, \
      const Vectorized& min, \
      const Vectorized& max) { \
    return clamp_max(clamp_min(a, min), max); \
  }

DEFINE_CLAMP_MAXMIN_FUNCS(int8_t)
DEFINE_CLAMP_MAXMIN_FUNCS(uint8_t)
DEFINE_CLAMP_MAXMIN_FUNCS(int16_t)
DEFINE_CLAMP_MAXMIN_FUNCS(int32_t)
DEFINE_CLAMP_MAXMIN_FUNCS(int64_t)
DEFINE_CLAMP_MAXMIN_FUNCS(float)
DEFINE_CLAMP_MAXMIN_FUNCS(double)

namespace { /* unnamed namespace */

// When vec_float is not defined or __ARCH__ < 13, the 4-lane int<->float
// conversions are emulated with per-lane scalar casts; otherwise they are
// aliased to the vec_float / vec_signed builtins.
#if !defined(vec_float) || __ARCH__ < 13
#warning \
    "float->int and int->float conversion is simulated. compile for z15 for improved performance"
inline ZSimdVect vec_int_flt(const ZSimdVect x) {
  return ZSimdVect{float(x[0]), float(x[1]), float(x[2]), float(x[3])};
}
inline ZSimdVect vec_flt_int(const ZSimdVect x) {
  return ZSimdVect{int(x[0]), int(x[1]), int(x[2]), int(x[3])};
}
#else
#define vec_int_flt vec_float
#define vec_flt_int vec_signed
#endif

// Same-width numeric conversions, applied to each half.
Vectorized zvec_convert_to_float(const Vectorized& x) {
  return {vec_int_flt(x.vec0()), vec_int_flt(x.vec1())};
}

Vectorized zvec_convert_to_int(const Vectorized& x) {
  return {vec_flt_int(x.vec0()), vec_flt_int(x.vec1())};
}

Vectorized zvec_convert_to_float(const Vectorized& x) {
  return {vec_double(x.vec0()), vec_double(x.vec1())};
}

Vectorized zvec_convert_to_int(const Vectorized& x) {
  return {vec_signed(x.vec0()), vec_signed(x.vec1())};
}

} /* unnamed namespace */

// Casts each half to the destination vector type with a C-style vector cast.
template Vectorized cast_zvector(const Vectorized& x) {
  using cast_type = typename Vectorized::vtype;
  return Vectorized{(cast_type)x.vec0(), (cast_type)x.vec1()};
}

// fmadd(a, b, c) = a * b + c. The first two specializations call the
// __builtin_s390_vfmasb / __builtin_s390_vfmadb builtins; the remaining three
// use a separate multiply and add.
template <>
Vectorized C10_ALWAYS_INLINE fmadd(
    const Vectorized& a,
    const Vectorized& b,
    const Vectorized& c) {
  return Vectorized{
      __builtin_s390_vfmasb(a.vec0(), b.vec0(), c.vec0()),
      __builtin_s390_vfmasb(a.vec1(), b.vec1(), c.vec1())};
}

template <>
Vectorized C10_ALWAYS_INLINE fmadd(
    const Vectorized& a,
    const Vectorized& b,
    const Vectorized& c) {
  return Vectorized{
      __builtin_s390_vfmadb(a.vec0(), b.vec0(), c.vec0()),
      __builtin_s390_vfmadb(a.vec1(), b.vec1(), c.vec1())};
}

template <>
Vectorized C10_ALWAYS_INLINE fmadd(
    const Vectorized& a,
    const Vectorized& b,
    const Vectorized& c) {
  return Vectorized{
      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
}

template <>
Vectorized C10_ALWAYS_INLINE fmadd(
    const Vectorized& a,
    const Vectorized& b,
    const Vectorized& c) {
  return Vectorized{
      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
}

template <>
Vectorized C10_ALWAYS_INLINE fmadd(
    const Vectorized& a,
    const Vectorized& b,
    const Vectorized& c) {
  return Vectorized{
      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
}

// Conversions to an integer vector of the same size; both forward to
// zvec_convert_to_int.
template <>
Vectorized C10_ALWAYS_INLINE
convert_to_int_of_same_size(const Vectorized& src) {
  return zvec_convert_to_int(src);
}

template <>
Vectorized C10_ALWAYS_INLINE
convert_to_int_of_same_size(const Vectorized& src) {
  return zvec_convert_to_int(src);
}

// Bulk int32_t -> float conversion: full vectors first, then a scalar loop
// for the remaining tail elements.
template <>
inline void convert(const int32_t* src, float* dst, int64_t n) {
  // int32_t and float have same size
  int64_t i;
  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
    const int32_t* src_a = src + i;
    float* dst_a = dst + i;
    auto input_vec = Vectorized::loadu(src_a);
    auto output_vec = zvec_convert_to_float(input_vec);
    output_vec.store(dst_a);
  }

  for (; i < n; i++) {
    dst[i] = static_cast(src[i]);
  }
}

// Bulk int64_t -> double conversion, same structure as above.
template <>
inline void convert(const int64_t* src, double* dst, int64_t n) {
  int64_t i;
  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
    const int64_t* src_a = src + i;
    double* dst_a = dst + i;
    auto input_vec = Vectorized::loadu(src_a);
    auto output_vec = zvec_convert_to_float(input_vec);
    output_vec.store(dst_a);
  }
  for (; i < n; i++) {
    dst[i] = static_cast(src[i]);
  }
}

// Defines a `cast` specialization that forwards to cast_zvector.
#define DEFINE_REINTERPRET_CAST_FUNCS(Fst, Cst) \
  template <> \
  C10_ALWAYS_INLINE Vectorized cast( \
      const Vectorized& src) { \
    return cast_zvector(src); \
  }

// Instantiates DEFINE_REINTERPRET_CAST_FUNCS for Fst paired with each of the
// five types listed.
#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(Fst) \
  DEFINE_REINTERPRET_CAST_FUNCS(Fst, double) \
  DEFINE_REINTERPRET_CAST_FUNCS(Fst, float) \
  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int64_t) \
  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int32_t) \
  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int16_t)
// NOTE(review): in this copy of the file the angle-bracket template argument
// lists appear to have been lost (e.g. `template struct unpack_type`,
// `std::pair, Vectorized>`). The comments added below describe only what is
// still visible; confirm details against an intact copy before relying on them.

// Emit the reinterpreting `cast` specializations for each listed source type.
DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float)
DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double)
DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t)
DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t)
DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t)

#undef DEFINE_REINTERPRET_CAST_FUNCS

// Type trait used as the default result element type of unpack(). The primary
// template is the identity; the specializations yield int16_t, int16_t and
// int32_t (the types they are specialized for are not visible here).
template
struct unpack_type {
  using type = T;
};
template <>
struct unpack_type {
  using type = int16_t;
};
template <>
struct unpack_type {
  using type = int16_t;
};
template <>
struct unpack_type {
  using type = int32_t;
};

// Type trait used as the default result element type of pack(). The primary
// template is the identity; the specializations yield int8_t and int16_t.
template
struct pack_type {
  using type = T;
};
template <>
struct pack_type {
  using type = int8_t;
};
template <>
struct pack_type {
  using type = int16_t;
};

namespace { /* unnamed namespace */

// Splits x into a pair of vectors: the first is built from
// vec_unpackh/vec_unpackl of x.vec0(), the second from the same two calls on
// x.vec1().
template ::type>
std::pair, Vectorized> unpack(const Vectorized& x) {
  auto vec0 = vec_unpackh(x.vec0());
  auto vec1 = vec_unpackl(x.vec0());
  auto vec2 = vec_unpackh(x.vec1());
  auto vec3 = vec_unpackl(x.vec1());
  return {Vectorized{vec0, vec1}, Vectorized{vec2, vec3}};
}

// Specialization of unpack(): same unpackh/unpackl sequence, but the results
// are stored with the explicit vector type `typeX` and then passed through
// cast_zvector before being returned. The masking step is commented out.
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
template <>
std::pair, Vectorized> unpack(
    const Vectorized& x) {
  using typeX = typename Vectorized::vtype;
  typeX vec0 = vec_unpackh(x.vec0());
  typeX vec1 = vec_unpackl(x.vec0());
  typeX vec2 = vec_unpackh(x.vec1());
  typeX vec3 = vec_unpackl(x.vec1());
  // auto mask = Vectorized(0xFF);
  // vec0 = vec0 & mask;
  // vec1 = vec1 & mask;
  // vec2 = vec2 & mask;
  // vec3 = vec3 & mask;
  return {
      cast_zvector(Vectorized{vec0, vec1}),
      cast_zvector(Vectorized{vec2, vec3})};
}
C10_DIAGNOSTIC_POP()

// Combines two vectors into one: vec_packs over the two halves of `first`
// forms the first half of the result, vec_packs over the two halves of
// `second` forms the second half.
template ::type>
Vectorized pack(const Vectorized& first, const Vectorized& second) {
  auto vec0 = vec_packs(first.vec0(), first.vec1());
  auto vec1 = vec_packs(second.vec0(), second.vec1());
  return Vectorized{vec0, vec1};
}

// Specialization of pack() that uses vec_packsu instead of vec_packs.
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
template <>
Vectorized pack(
    const Vectorized& first,
    const Vectorized& second) {
  auto vec0 = vec_packsu(first.vec0(), first.vec1());
  auto vec1 = vec_packsu(second.vec0(), second.vec1());
  return Vectorized{vec0, vec1};
}
C10_DIAGNOSTIC_POP()

} /* unnamed namespace */

//////////////////////////////////QUANT///////////////////////////////////////////
// NOTE(review): the enable_if predicate is not visible in this copy; presumably
// it is the quantized-type predicate (is_zarch_implemented_quant) — confirm.
template
struct is_vec_specialized_for<
    T,
    std::enable_if_t()>> : std::bool_constant {};

// Vectorized specialization that wraps an inner Vectorized (`vinner_type`) over
// `T::underlying` and adds quantization helpers (relu, dequantize, quantize,
// requantize_from_int, widening_subtract). Overloads are selected with
// enable_if on float_num_vecs()/int_num_vecs() being 1 or 4.
template
struct Vectorized()>> {
 public:
  using value_type = typename T::underlying;
  using vtype = ZSimdVect;
  using vmaskType = ZSimdVectBinary;
  using vinner_type = Vectorized;
  using size_type = int;

  // Number of value_type elements held: VECTOR_WIDTH / sizeof(value_type).
  static constexpr size_type size() {
    return VECTOR_WIDTH / sizeof(value_type);
  }

  // How many Vectorized objects of the (not visible) inner type are needed to
  // hold size() elements.
  static constexpr int float_num_vecs() {
    return size() / Vectorized::size();
  }
  static constexpr int int_num_vecs() {
    return float_num_vecs();
  }
  using float_vec_return_type = std::array, float_num_vecs()>;
  using int_vec_return_type = std::array, int_num_vecs()>;

 private:
  vinner_type _vec;

 public:
  Vectorized() {}

  explicit C10_ALWAYS_INLINE Vectorized(vinner_type v) : _vec{v} {}
  // Constructs the inner vector from the scalar's `val_` member.
  Vectorized(const T& val) : _vec(val.val_) {}

  C10_ALWAYS_INLINE const vinner_type& vec() const {
    return _vec;
  }

  // Load/store simply forward to the inner vector type.
  template
  static Vectorized C10_ALWAYS_INLINE
  loadu(const U* ptr, int count = size()) {
    return Vectorized{vinner_type::loadu(ptr, count)};
  }

  template
  void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const {
    _vec.store(ptr, count);
  }

  // Element-wise maximum of this vector and zero_point.
  Vectorized relu(Vectorized zero_point) const {
    return Vectorized{_vec.maximum(zero_point._vec)};
  }

  // Element-wise maximum with zero_point followed by minimum with q_six.
  Vectorized relu6(Vectorized zero_point, Vectorized q_six) const {
    auto ret_max = _vec.maximum(zero_point._vec);
    auto ret_min = ret_max.minimum(q_six._vec);
    return Vectorized{ret_min};
  }

  // ---- overloads enabled when float_num_vecs()/int_num_vecs() == 1 ----

  // Returns a one-element array holding (*this - b).
  template <
      typename U = T,
      std::enable_if_t::float_num_vecs() == 1, int> = 0>
  int_vec_return_type widening_subtract(Vectorized b) const {
    return {*this - b};
  }

  // Converts _vec to float and returns fmadd(scale, value, scale_zp_premul).
  // zero_point is not read by this overload.
  template <
      typename U = T,
      std::enable_if_t::float_num_vecs() == 1, int> = 0>
  float_vec_return_type dequantize(
      Vectorized scale,
      Vectorized zero_point,
      Vectorized scale_zp_premul) const {
    auto float_val = zvec_convert_to_float(_vec);
    return {fmadd(scale, float_val, scale_zp_premul)};
  }

  // Converts _vec to float and returns (value - zero_point) * scale.
  template <
      typename U = T,
      std::enable_if_t::float_num_vecs() == 1, int> = 0>
  float_vec_return_type dequantize(
      Vectorized scale,
      Vectorized zero_point) const {
    auto float_val = zvec_convert_to_float(_vec);
    return {(float_val - zero_point) * scale};
  }

  // Computes rint(rhs[0] * inverse_scale) + zero_point and converts the result
  // to integers. The `scale` parameter is not read.
  template <
      typename U = T,
      std::enable_if_t::float_num_vecs() == 1, int> = 0>
  static Vectorized quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    Vectorized vecf = rhs[0];
    vecf = vecf * Vectorized(inverse_scale);
    vecf = vecf.rint() + Vectorized((float)(zero_point));
    auto veci = zvec_convert_to_int(vecf);
    return Vectorized{veci};
  }

  // Converts inp[0] to float, multiplies by `multiplier`, rounds with rint(),
  // converts back to integers and then adds zero_point.
  template <
      typename U = T,
      std::enable_if_t::int_num_vecs() == 1, int> = 0>
  static Vectorized requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    Vectorized vi = inp[0];
    auto vecf = zvec_convert_to_float(vi.vec());
    vecf = vecf * Vectorized(multiplier);
    vecf = vecf.rint();
    auto veci = zvec_convert_to_int(vecf) + Vectorized(zero_point);
    return Vectorized{veci};
  }

  // ---- overloads enabled when float_num_vecs()/int_num_vecs() == 4 ----

  // Unpacks both operands twice (giving four vectors each) and returns the
  // four element-wise differences.
  template <
      typename U = T,
      std::enable_if_t::int_num_vecs() == 4, int> = 0>
  int_vec_return_type widening_subtract(Vectorized b) const {
    auto ret16 = unpack(_vec);
    auto ret16B = unpack(b.vec());
    auto ret32_0 = unpack(ret16.first);
    auto ret32_1 = unpack(ret16.second);
    auto ret32B_0 = unpack(ret16B.first);
    auto ret32B_1 = unpack(ret16B.second);
    return {
        Vectorized(ret32_0.first - ret32B_0.first),
        Vectorized(ret32_0.second - ret32B_0.second),
        Vectorized(ret32_1.first - ret32B_1.first),
        Vectorized(ret32_1.second - ret32B_1.second)};
  }

  // Unpacks _vec twice, converts the four resulting vectors to float and
  // applies fmadd(scale, value, scale_zp_premul) to each. zero_point is not
  // read by this overload.
  template <
      typename U = T,
      std::enable_if_t::float_num_vecs() == 4, int> = 0>
  float_vec_return_type C10_ALWAYS_INLINE dequantize(
      Vectorized scale,
      Vectorized zero_point,
      Vectorized scale_zp_premul) const {
    // unpacking unsigned as signed
    auto ret16 = unpack(_vec);
    auto ret32_0 = unpack(ret16.first);
    auto ret32_1 = unpack(ret16.second);

    auto vecf_0 = zvec_convert_to_float(ret32_0.first);
    auto vecf_1 = zvec_convert_to_float(ret32_0.second);

    auto vecf_2 = zvec_convert_to_float(ret32_1.first);
    auto vecf_3 = zvec_convert_to_float(ret32_1.second);
    return {
        fmadd(scale, vecf_0, scale_zp_premul),
        fmadd(scale, vecf_1, scale_zp_premul),
        fmadd(scale, vecf_2, scale_zp_premul),
        fmadd(scale, vecf_3, scale_zp_premul)};
  }

  // Same unpack/convert sequence as above, returning (value - zero_point) *
  // scale for each of the four vectors.
  template <
      typename U = T,
      std::enable_if_t::float_num_vecs() == 4, int> = 0>
  float_vec_return_type dequantize(
      Vectorized scale,
      Vectorized zero_point) const {
    // unpacking unsigned as signed
    auto ret16 = unpack(_vec);
    auto ret32_0 = unpack(ret16.first);
    auto ret32_1 = unpack(ret16.second);

    auto vecf_0 = zvec_convert_to_float(ret32_0.first);
    auto vecf_1 = zvec_convert_to_float(ret32_0.second);

    auto vecf_2 = zvec_convert_to_float(ret32_1.first);
    auto vecf_3 = zvec_convert_to_float(ret32_1.second);

    return {
        (vecf_0 - zero_point) * scale,
        (vecf_1 - zero_point) * scale,
        (vecf_2 - zero_point) * scale,
        (vecf_3 - zero_point) * scale};
  }

  // For each of the four inputs: multiply by inverse_scale, rint(), add
  // zero_point, convert to integers; then pack twice down to a single vector.
  // The `scale` parameter is not read.
  template <
      typename U = T,
      std::enable_if_t::float_num_vecs() == 4, int> = 0>
  static Vectorized quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    auto vec_inverse = Vectorized(inverse_scale);
    auto vec_zero_point = Vectorized((float)zero_point);

    auto vecf0 = rhs[0];
    auto vecf2 = rhs[1];
    auto vecf4 = rhs[2];
    auto vecf6 = rhs[3];

    vecf0 = vecf0 * vec_inverse;
    vecf2 = vecf2 * vec_inverse;
    vecf4 = vecf4 * vec_inverse;
    vecf6 = vecf6 * vec_inverse;

    vecf0 = vecf0.rint() + vec_zero_point;
    vecf2 = vecf2.rint() + vec_zero_point;
    vecf4 = vecf4.rint() + vec_zero_point;
    vecf6 = vecf6.rint() + vec_zero_point;

    auto veci0 = zvec_convert_to_int(vecf0);
    auto veci2 = zvec_convert_to_int(vecf2);
    auto veci4 = zvec_convert_to_int(vecf4);
    auto veci6 = zvec_convert_to_int(vecf6);

    auto vecshi0 = pack(veci0, veci2);
    auto vecshi2 = pack(veci4, veci6);
    auto ret = pack(vecshi0, vecshi2);

    return Vectorized{ret};
  }

  // For each of the four inputs: convert to float, multiply by `multiplier`,
  // rint(), convert to integers, add zero_point; then pack twice down to a
  // single vector.
  template <
      typename U = T,
      std::enable_if_t::int_num_vecs() == 4, int> = 0>
  static Vectorized requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    Vectorized vec_multiplier = Vectorized(multiplier);
    Vectorized vec_zero_point = Vectorized(zero_point);

    Vectorized vi0 = inp[0];
    Vectorized vi1 = inp[1];
    Vectorized vi2 = inp[2];
    Vectorized vi3 = inp[3];

    auto vecf0 = zvec_convert_to_float(vi0.vec());
    auto vecf2 = zvec_convert_to_float(vi1.vec());

    auto vecf4 = zvec_convert_to_float(vi2.vec());
    auto vecf6 = zvec_convert_to_float(vi3.vec());

    vecf0 = vecf0 * vec_multiplier;
    vecf2 = vecf2 * vec_multiplier;

    vecf4 = vecf4 * vec_multiplier;
    vecf6 = vecf6 * vec_multiplier;

    vecf0 = vecf0.rint();
    vecf2 = vecf2.rint();
    vecf4 = vecf4.rint();
    vecf6 = vecf6.rint();

    auto veci0 = zvec_convert_to_int(vecf0);
    auto veci2 = zvec_convert_to_int(vecf2);
    auto veci4 = zvec_convert_to_int(vecf4);
    auto veci6 = zvec_convert_to_int(vecf6);

    veci0 = veci0 + vec_zero_point;
    veci2 = veci2 + vec_zero_point;

    veci4 = veci4 + vec_zero_point;
    veci6 = veci6 + vec_zero_point;

    auto vecshi0 = pack(veci0, veci2);
    auto vecshi2 = pack(veci4, veci6);

    auto ret = pack(vecshi0, vecshi2);

    return Vectorized{ret};
  }

  // Comparison, clamp and min/max members all forward to the inner vector.
  Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const {
    return Vectorized{_vec.eq(other._vec)};
  }
  Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const {
    return Vectorized{_vec.ne(other._vec)};
  }
  Vectorized C10_ALWAYS_INLINE gt(const Vectorized& other) const {
    return Vectorized{_vec.gt(other._vec)};
  }
  Vectorized C10_ALWAYS_INLINE ge(const Vectorized& other) const {
    return Vectorized{_vec.ge(other._vec)};
  }
  Vectorized C10_ALWAYS_INLINE lt(const Vectorized& other) const {
    return Vectorized{_vec.lt(other._vec)};
  }
  Vectorized C10_ALWAYS_INLINE le(const Vectorized& other) const {
    return Vectorized{_vec.le(other._vec)};
  }

  Vectorized clamp_min(const Vectorized& min) const {
    return Vectorized{_vec.clamp_min(min._vec)};
  }

  Vectorized clamp_max(const Vectorized& max) const {
    return Vectorized{_vec.clamp_max(max._vec)};
  }

  Vectorized minimum(const Vectorized& other) const {
    return Vectorized{_vec.minimum(other._vec)};
  }

  Vectorized maximum(const Vectorized& other) const {
    return Vectorized{_vec.maximum(other._vec)};
  }
};

// Free operators for the quantized types: every operator applies the same
// operator to the inner vectors (a.vec() op b.vec()) and wraps the result.
#define ZVECTOR_OPERATORS(typex) \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator+( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() + b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator-( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() - b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator*( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() * b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator/( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() / b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator&( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() & b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator|( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() | b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator^( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() ^ b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator==( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() == b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator!=( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() != b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator>( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() > b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator>=( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() >= b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator<( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() < b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator<=( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() <= b.vec()}; \
  }

ZVECTOR_OPERATORS(c10::qint32)
ZVECTOR_OPERATORS(c10::qint8)
ZVECTOR_OPERATORS(c10::quint8)

#undef ZVECTOR_OPERATORS

DEFINE_CLAMP_MAXMIN_FUNCS(c10::quint8)
DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint8)
DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint32)

// Lane constants used by the complex specialization below. That struct stores
// each complex value as an adjacent (real, imag) pair of lanes (see its
// constructors), so "first lane of each pair" is the real part and "second
// lane of each pair" is the imaginary part. Each helper has a four-lane primary
// template and a two-lane specialization.

// All bits set in the first lane of each pair, zero in the second.
template
constexpr auto real_mask() {
  return (ZSimdVect)ZSimdVectBinary{0xFFFFFFFF, 0, 0xFFFFFFFF, 0};
}

template <>
constexpr auto real_mask() {
  return (ZSimdVect)ZSimdVectBinary{0xFFFFFFFFFFFFFFFF, 0};
}

// All bits set in the second lane of each pair, zero in the first.
template
constexpr auto image_mask() {
  return (ZSimdVect)ZSimdVectBinary{0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
}

template <>
constexpr auto image_mask() {
  return (ZSimdVect)ZSimdVectBinary{0, 0xFFFFFFFFFFFFFFFF};
}

// Negative zero in the first lane of each pair, positive zero in the second.
template
constexpr auto rsign_mask() {
  return ZSimdVect{-0.f, 0.f, -0.f, 0.f};
}

template <>
constexpr auto rsign_mask() {
  return ZSimdVect{-0.0, 0.f};
}

// Negative zero in the second lane of each pair, positive zero in the first.
template
constexpr auto isign_mask() {
  return ZSimdVect{0.0, -0.f, 0.0, -0.f};
}

template <>
constexpr auto isign_mask() {
  return ZSimdVect{0.0, -0.0};
}

// 0 in the first lane of each pair, 1 in the second.
template
constexpr auto image_one() {
  return ZSimdVect{0, 1.f, 0, 1.f};
}

template <>
constexpr auto image_one() {
  return ZSimdVect{0.0, 1.0};
}

// M_PI / 2 in the first lane of each pair, 0 in the second.
template
constexpr auto pi_half() {
  return ZSimdVect{(float)(M_PI / 2.0), 0.f, (float)(M_PI / 2.0), 0.f};
}

template <>
constexpr auto pi_half() {
  return ZSimdVect{M_PI / 2.0, 0.0};
}

// 0 in the first lane of each pair, 0.5 in the second.
template
constexpr auto image_half() {
  return ZSimdVect{0, 0.5f, 0, 0.5f};
}

template <>
constexpr auto image_half() {
  return ZSimdVect{0.0, 0.5};
}

// Scalar factor used by log2() below; the literal is converted to U.
template
constexpr U log2e_inv() {
  return static_cast(1.4426950408889634);
}

// Scalar factor used by log10() below; the literal is converted to U.
template
constexpr U log10e_inv() {
  return static_cast(0.43429448190325176);
}

// NOTE(review): the enable_if predicate is not visible in this copy; presumably
// it is the complex-type predicate (is_zarch_implemented_complex) — confirm.
template
struct is_vec_specialized_for<
    T,
    std::enable_if_t()>> : std::bool_constant {};

// Vectorized specialization for complex element types. Values are kept in an
// inner real-valued Vectorized (`vinner_type`) as interleaved
// (real, imag) lanes; many math functions fall back to a per-element scalar
// call through mapOrdinary().
template
struct Vectorized()>> {
 public:
  using underline_type = decltype(std::declval().imag());
  using value_type = T;
  using vtype = ZSimdVect;
  using vmaskType = ZSimdVectBinary;
  using vinner_type = Vectorized;
  using size_type = int;
  using vinner_data = typename Vectorized::vinner_data;

  // Number of complex elements held: VECTOR_WIDTH / sizeof(value_type).
  static constexpr size_type size() {
    return VECTOR_WIDTH / sizeof(value_type);
  }

 private:
  vinner_type _vec;

 public:
  Vectorized() {}

  C10_ALWAYS_INLINE Vectorized(const vinner_data& v)
      : _vec{v.first, v.second} {}

  // Element-wise constructors: lanes are filled as real, imag, real, imag, ...
  template  = 0>
  C10_ALWAYS_INLINE Vectorized(T s1, T s2)
      : _vec{s1.real(), s1.imag(), s2.real(), s2.imag()} {}

  template  = 0>
  C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4)
      : _vec{
            s1.real(),
            s1.imag(),
            s2.real(),
            s2.imag(),
            s3.real(),
            s3.imag(),
            s4.real(),
            s4.imag()} {}

  // Broadcast constructors: delegate to the element-wise constructors above.
  template  = 0>
  C10_ALWAYS_INLINE Vectorized(T s) : Vectorized(s, s) {}

  template  = 0>
  C10_ALWAYS_INLINE Vectorized(T s) : Vectorized(s, s, s, s) {}

  C10_ALWAYS_INLINE operator vinner_type() const {
    return _vec;
  }

  C10_ALWAYS_INLINE const vinner_type& vec() const {
    return _vec;
  }

  C10_ALWAYS_INLINE operator vinner_data() const {
    return _vec.data();
  }

  C10_ALWAYS_INLINE vinner_data data() const {
    return _vec.data();
  }

  // Load/store forward to the inner vector with twice the element count
  // (two inner lanes per complex element).
  template
  static Vectorized C10_ALWAYS_INLINE
  loadu(const U* ptr, int count = size()) {
    return Vectorized{vinner_type::loadu(ptr, 2 * count)};
  }

  template
  void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const {
    return _vec.store(ptr, 2 * count);
  }

  // Blends a and b using a runtime mask; the mask lanes are duplicated with
  // vec_mergeh(x, x) before being handed to the inner blendv.
  static Vectorized blendv(
      const Vectorized& a,
      const Vectorized& b,
      const Vectorized& mask) {
    // convert std::complex index mask to V index mask: xy -> xxyy
    vinner_type vmask = mask.vec();
    auto mask_complex = vinner_type(
        vec_mergeh(vmask.vec0(), vmask.vec0()),
        vec_mergeh(vmask.vec1(), vmask.vec1()));
    return Vectorized{vinner_type::blendv(a.vec(), b.vec(), mask_complex)};
  }

  // Compile-time blend: the mask is translated with maskForComplex and the
  // inner blend is applied.
  template
  static auto C10_ALWAYS_INLINE
  blend(const Vectorized& a, const Vectorized& b) {
    constexpr int mask_complex = maskForComplex(mask);
    return Vectorized{
        vinner_type::template blend(a.vec(), b.vec())};
  }

  // Returns {base, base + step} (two-element overload).
  template
  static std::enable_if_t> arange(
      T base = 0,
      step_t step = static_cast(1)) {
    return Vectorized(base, base + step);
  }

  // Returns {base, base + step, base + 2 * step, base + 3 * step}
  // (four-element overload).
  template
  static std::enable_if_t> arange(
      T base = 0,
      step_t step = static_cast(1)) {
    return Vectorized(
        base,
        base + step,
        base + value_type(2) * step,
        base + value_type(3) * step);
  }

  // Recursion terminator for set(): once Z >= C the result is b.
  template
  static inline std::enable_if_t<(Z >= C), Vectorized> set_inner(
      const Vectorized& a,
      const Vectorized& b,
      size_t count) {
    return b;
  }

  // Recursive step for set(): when count == Z return a blend of a and b,
  // otherwise recurse.
  template
  static inline std::enable_if_t<(Z < C), Vectorized> set_inner(
      const Vectorized& a,
      const Vectorized& b,
      size_t count) {
    if (count == Z)
      return blend(a, b);
    else
      return set_inner(a, b, count);
  }

  // Returns a when count == 0; otherwise dispatches on count via set_inner.
  static Vectorized set(
      const Vectorized& a,
      const Vectorized& b,
      size_t count = size()) {
    if (count == 0)
      return a;
    return set_inner<1, size()>(a, b, count);
  }

  // Element access is intentionally unavailable.
  const T& operator[](int idx) const = delete;
  T& operator[](int idx) = delete;

  // mapOrdinary: apply a scalar function f to every complex element. Each
  // element is rebuilt as T(lane[2k], lane[2k + 1]) from the inner vectors.
  // Overloads exist for f taking `const T&`, f taking `T` by value, and a
  // binary f taking two `const T&`, each in a four-element and a two-element
  // variant.
  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  Vectorized mapOrdinary(T (*const f)(const T&)) const {
    auto v0 = _vec.vec0();
    auto v1 = _vec.vec1();
    return Vectorized{
        f(T(v0[0], v0[1])),
        f(T(v0[2], v0[3])),
        f(T(v1[0], v1[1])),
        f(T(v1[2], v1[3]))};
  }

  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  Vectorized mapOrdinary(T (*const f)(const T&)) const {
    auto v0 = _vec.vec0();
    auto v1 = _vec.vec1();
    return Vectorized{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))};
  }

  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  Vectorized mapOrdinary(T (*const f)(T)) const {
    auto v0 = _vec.vec0();
    auto v1 = _vec.vec1();
    return Vectorized{
        f(T(v0[0], v0[1])),
        f(T(v0[2], v0[3])),
        f(T(v1[0], v1[1])),
        f(T(v1[2], v1[3]))};
  }

  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  Vectorized mapOrdinary(T (*const f)(T)) const {
    auto v0 = _vec.vec0();
    auto v1 = _vec.vec1();
    return Vectorized{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))};
  }

  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  inline Vectorized mapOrdinary(
      T (*const f)(const T&, const T&),
      const Vectorized& b) const {
    auto v0 = _vec.vec0();
    auto v1 = _vec.vec1();
    auto bvec = b.vec();
    auto b0 = bvec.vec0();
    auto b1 = bvec.vec1();
    T a00 = f(T(v0[0], v0[1]), T(b0[0], b0[1]));
    T a01 = f(T(v0[2], v0[3]), T(b0[2], b0[3]));
    T a02 = f(T(v1[0], v1[1]), T(b1[0], b1[1]));
    T a03 = f(T(v1[2], v1[3]), T(b1[2], b1[3]));
    return Vectorized{a00, a01, a02, a03};
  }

  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  inline Vectorized mapOrdinary(
      T (*const f)(const T&, const T&),
      const Vectorized& b) const {
    auto v0 = _vec.vec0();
    auto v1 = _vec.vec1();
    auto bvec = b.vec();
    auto b0 = bvec.vec0();
    auto b1 = bvec.vec1();
    U a00 = f(U(v0[0], v0[1]), U(b0[0], b0[1]));
    U a01 = f(U(v1[0], v1[1]), U(b1[0], b1[1]));
    return Vectorized{a00, a01};
  }

  // real_neg (four-lane variant): combines a.neg() and a with vec_perm.
  // The byte indices in swap_mask take bytes 0-3 and 8-11 from the first
  // operand (a_neg) and bytes 20-23 and 28-31 from the second (a) — i.e.
  // lanes 0 and 2 come from the negated vector, lanes 1 and 3 from a.
  // NOTE(review): this relies on vec_perm treating indices 16-31 as the second
  // operand; confirm against the vector intrinsics reference.
  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  static typename Vectorized::vinner_type real_neg(
      const typename Vectorized::vinner_type& a) {
    const auto swap_mask = ZSimdVectBinary{
        0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31};

    auto a_neg = a.neg();
    vtype v0 = vec_perm(a_neg.vec0(), a.vec0(), swap_mask);
    vtype v1 = vec_perm(a_neg.vec1(), a.vec1(), swap_mask);
    return {v0, v1};
  }

  // real_neg (two-lane variant): lane 0 is taken from a.neg(), lane 1 from a.
  template <
      typename U = T,
      std::enable_if_t>::value, int> = 0>
  static typename Vectorized::vinner_type real_neg(
      const typename Vectorized::vinner_type& a) {
    auto a_neg = a.neg();
    vtype v0 = {a_neg.vec0()[0], a.vec0()[1]};
    vtype v1 = {a_neg.vec1()[0], a.vec1()[1]};
    return {v0, v1};
  }

  // atan2 of the inner vector against its lane-swapped copy, swapped back.
  Vectorized angle2_() const {
    auto b_a = _vec.swapped(); // b        a
    return Vectorized{_vec.atan2(b_a).swapped()};
  }

  // Keeps only the real lanes of angle2_().
  Vectorized angle() const {
    return angle2_().real();
  }

  Vectorized atan() const {
    // atan(x) = i/2 * ln((i + z)/(i - z))
    auto ione = Vectorized{vinner_type(image_one())};
    auto sum = ione + *this;
    auto sub = ione - *this;
    auto ln = (sum / sub).log(); // ln((i + z)/(i - z))
    return ln *
        Vectorized{vinner_type(image_half())}; // i/2*ln()
  }

  Vectorized atanh() const {
    return mapOrdinary(std::atanh);
  }

  Vectorized asin() const {
    // asin(x)
    // = -i*ln(iz + sqrt(1 -z^2))
    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
    // The vectorized branch is the one compiled (`#if 1`); the #else branch is
    // the scalar fallback through mapOrdinary.
#if 1
    vinner_type cnj = conj().vec();
    vinner_type b_a = cnj.swapped();
    vinner_type ab = cnj * b_a;
    vinner_type im = ab + ab;
    vinner_type val_2 = _vec * _vec;
    vinner_type val_2_swapped = val_2.swapped();
    vinner_type re = vinner_type::horizontal_sub_perm(val_2, val_2_swapped);
    re = vinner_type(static_cast(1)) - re;
    constexpr int blend_mask =
        blend_choice(); // 0x0A for complex , 0xAA for complex
    vinner_type blendx = vinner_type::template blend(re, im);
    auto root = Vectorized(blendx).sqrt();
    auto ln = Vectorized(Vectorized(b_a) + root).log();
    return Vectorized(ln.vec().swapped()).conj();
#else
    return mapOrdinary(std::asin);
#endif
  }

  Vectorized acos() const {
    // acos(x) = pi/2 - asin(x)
    return Vectorized(vinner_type(pi_half())) - asin();
  }

  // The following functions either call the std:: scalar function on every
  // element (mapOrdinary) or forward to the inner vector lane-wise.
  Vectorized sin() const {
    return mapOrdinary(std::sin);
  }
  Vectorized sinh() const {
    return mapOrdinary(std::sinh);
  }
  Vectorized cos() const {
    return mapOrdinary(std::cos);
  }
  Vectorized cosh() const {
    return mapOrdinary(std::cosh);
  }
  Vectorized ceil() const {
    return Vectorized{_vec.ceil()};
  }
  Vectorized floor() const {
    return Vectorized{_vec.floor()};
  }
  Vectorized neg() const {
    return Vectorized(_vec.neg());
  }
  Vectorized round() const {
    return Vectorized{_vec.round()};
  }
  Vectorized tan() const {
    return mapOrdinary(std::tan);
  }
  Vectorized tanh() const {
    return mapOrdinary(std::tanh);
  }
  Vectorized trunc() const {
    return Vectorized{_vec.trunc()};
  }

  Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const {
    auto eq = _vec.eq(other._vec); // compares real and imag individually
    // If both real numbers and imag numbers are equal, then the complex numbers
    // are equal
    auto real = eq & vinner_type(real_mask());
    auto imag = (eq & vinner_type(image_mask())).swapped();
    return Vectorized{real & imag};
  }
  Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const {
    auto ne = _vec.ne(other._vec); // compares real and imag individually
    // If either real numbers or imag numbers are not equal, then the complex
    // numbers are not equal
    auto real = ne & vinner_type(real_mask());
    auto imag = (ne & vinner_type(image_mask())).swapped();
    return Vectorized{real | imag};
  }

  // Keeps the real lanes, zeroing the imaginary lanes.
  Vectorized real() const {
    return Vectorized(_vec & vinner_type(real_mask()));
  }
  // Keeps the imaginary lanes in place, zeroing the real lanes.
  Vectorized imag_() const {
    return Vectorized(_vec & vinner_type(image_mask()));
  }
  // Like imag_(), but with the lanes of each pair swapped afterwards.
  Vectorized imag() const {
    return Vectorized{
        (_vec & vinner_type(image_mask())).swapped()};
  }

  // XOR with isign_mask (negative zero in the imaginary lanes).
  Vectorized conj() const {
    return Vectorized(_vec ^ vinner_type(isign_mask()));
  }

  // Squares every lane, adds the lane-swapped squares, then returns
  // mergee() of that sum as raw inner data.
  vinner_data abs_2_() const {
    auto a = _vec * _vec;
    a = a + a.swapped();
    return a.mergee().data();
  }

  // Adapter so std::abs (which is called on a T) yields a T for mapOrdinary.
  static T abs_helper(const T& value) {
    return T(std::abs(value));
  }

  Vectorized abs() const {
    return mapOrdinary(abs_helper);
  }

  Vectorized exp() const {
    return mapOrdinary(std::exp);
  }

  Vectorized exp2() const {
    return mapOrdinary(exp2_impl);
  }

  Vectorized expm1() const {
    return mapOrdinary(std::expm1);
  }

  Vectorized log() const {
    return mapOrdinary(std::log);
  }

  // log() scaled lane-wise by the log2e_inv constant.
  Vectorized log2() const {
    // log2eB_inv
    auto ret = log();
    return Vectorized{ret._vec * vinner_type(log2e_inv())};
  }

  // log() scaled lane-wise by the log10e_inv constant.
  Vectorized log10() const {
    auto ret = log();
    return Vectorized{ret._vec * vinner_type(log10e_inv())};
  }

  Vectorized log1p() const {
    return mapOrdinary(std::log1p);
  }

  Vectorized sgn() const {
    return mapOrdinary(at::native::sgn_impl);
  }

  Vectorized pow(const Vectorized& exp) const {
    return mapOrdinary(std::pow, exp);
  }

  Vectorized sqrt() const {
    return mapOrdinary(std::sqrt);
  }

  Vectorized reciprocal() const {
    // re + im*i = (a + bi)  / (c + di), here with a = 1 and b = 0
    // re = (ac + bd)/abs_2() = c/abs_2()
    // im = (bc - ad)/abs_2() = -d/abs_2()
    // The XOR with isign_mask flips the sign of the imaginary lanes (c, -d).
    vinner_type c_d = _vec ^ vinner_type(isign_mask());
    vinner_type abs = abs_2_();
    return Vectorized{c_d / abs};
  }

  Vectorized rsqrt() const {
    return sqrt().reciprocal();
  }

  // Ordering comparisons always fail the TORCH_CHECK below.
  Vectorized lt(const Vectorized& other) const {
    TORCH_CHECK(false, "not supported for complex numbers");
  }

  Vectorized le(const Vectorized& other) const {
    TORCH_CHECK(false, "not supported for complex numbers");
  }

  Vectorized gt(const Vectorized& other) const {
    TORCH_CHECK(false, "not supported for complex numbers");
  }

  Vectorized ge(const Vectorized& other) const {
    TORCH_CHECK(false, "not supported for complex numbers");
  }
};

// Free operators for the complex types. +, -, &, |, ^, == and != forward to
// the inner vectors; * and / implement complex multiplication and division on
// the interleaved lanes; the ordering operators always fail a TORCH_CHECK.
#define ZVECTOR_OPERATORS(typex) \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator+( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() + b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator-( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() - b.vec()}; \
  } \
  \
  template <> \
  Vectorized inline operator*( \
      const Vectorized& a, const Vectorized& b) { \
    /* (a + bi)  * (c + di) = (ac - bd) + (ad + bc)i */ \
    Vectorized::vinner_type bv = b.vec(); \
    \
    /* this is more z arch friendly than simulating horizontal from x86 */ \
    Vectorized::vinner_type vi = bv.mergeo(); \
    Vectorized::vinner_type vr = bv.mergee(); \
    vi = vi ^ \
        Vectorized::vinner_type( \
             rsign_mask::underline_type>()); \
    Vectorized::vinner_type ret = a.vec() * vr; \
    Vectorized::vinner_type vx_swapped = a.vec().swapped(); \
    ret = fmadd(vx_swapped, vi, ret); \
    \
    return Vectorized{ret}; \
  } \
  \
  template <> \
  Vectorized inline operator/( \
      const Vectorized& a, const Vectorized& b) { \
    /* Unfortunately, this breaks some tests */ \
    /* Implement it like it's done for avx2 */ \
    auto fabs_cd = b.vec().abs(); /* |c|    |d| */ \
    auto fabs_dc = fabs_cd.swapped(); /* |d|    |c| */ \
    auto scale = Vectorized::vinner_type{1.0} / \
        maximum(fabs_cd, fabs_dc); /* 1/sc     1/sc */ \
    auto a2 = a.vec() * scale; /* a/sc     b/sc */ \
    auto b2 = b.vec() * scale; /* c/sc     d/sc */ \
    auto acbd2 = a2 * b2; /* ac/sc^2  bd/sc^2 */ \
    \
    auto dc2 = b2.swapped(); /* d/sc         c/sc */ \
    dc2 = Vectorized::real_neg(dc2); /* -d/|c,d|        c/sc */ \
    auto adbc2 = a2 * dc2; /* -ad/sc^2      bc/sc^2 */ \
    auto sum1 = acbd2 + acbd2.swapped(); /* (ac+bd)/sc^2  (ac+bd)/sc^2 */ \
    auto sum2 = adbc2 + adbc2.swapped(); /* (bc-ad)/sc^2  (bc-ad)/sc^2 */ \
    auto res2 = Vectorized::vinner_type::mergee( \
        sum1, sum2); /* (ac+bd)/sc^2  (bc-ad)/sc^2 */ \
    \
    /* get the denominator */ \
    Vectorized::vinner_type denom2 = \
        Vectorized{b2}.abs_2_(); /* (c^2+d^2)/sc^2   (c^2+d^2)/sc^2 */ \
    res2 = res2 / denom2; \
    return Vectorized{res2}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator&( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() & b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator|( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() | b.vec()}; \
  } \
  \
  template <> \
  Vectorized C10_ALWAYS_INLINE operator^( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() ^ b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator==( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() == b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator!=( \
      const Vectorized& a, const Vectorized& b) { \
    return Vectorized{a.vec() != b.vec()}; \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator<( \
      const Vectorized& a, const Vectorized& b) { \
    TORCH_CHECK(false, "not supported for complex numbers"); \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator<=( \
      const Vectorized& a, const Vectorized& b) { \
    TORCH_CHECK(false, "not supported for complex numbers"); \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator>( \
      const Vectorized& a, const Vectorized& b) { \
    TORCH_CHECK(false, "not supported for complex numbers"); \
  } \
  \
  Vectorized C10_ALWAYS_INLINE operator>=( \
      const Vectorized& a, const Vectorized& b) { \
    TORCH_CHECK(false, "not supported for complex numbers"); \
  }

ZVECTOR_OPERATORS(c10::complex)
ZVECTOR_OPERATORS(c10::complex)

#undef ZVECTOR_OPERATORS

// Interleave helper for vectors with two lanes per half (element-indexed
// variant): pairs up lane k of a with lane k of b.
template  = 0>
std::pair, Vectorized> inline inner_interleave2(
    const Vectorized& a,
    const Vectorized& b) {
  // inputs:
  //   a      = {a0, a1, a2, a3}
  //   b      = {b0, b1, b2, b3}
  using vtype = typename Vectorized::vtype;
  vtype ab00 = {a.vec0()[0], b.vec0()[0]};
  vtype ab11 = {a.vec0()[1], b.vec0()[1]};
  vtype ab2_00 = {a.vec1()[0], b.vec1()[0]};
  vtype ab2_11 = {a.vec1()[1], b.vec1()[1]};
  //   return {a0, b0, a1, b1}
  //          {a2, b2, a3, b3}
  return std::make_pair(
      Vectorized{ab00, ab11}, Vectorized{ab2_00, ab2_11});
}

// Inverse of the helper above: gathers lane 0 of every half into the first
// result and lane 1 of every half into the second.
template  = 0>
std::pair, Vectorized> inline inner_deinterleave2(
    const Vectorized& a,
    const Vectorized& b) {
  // inputs:
  //   a = {a0, b0, a1, b1}
  //   b = {a2, b2, a3, b3}
  using vtype = typename Vectorized::vtype;
  vtype aa01 = {a.vec0()[0], a.vec1()[0]};
  vtype aa23 = {b.vec0()[0], b.vec1()[0]};

  vtype bb_01 = {a.vec0()[1], a.vec1()[1]};
  vtype bb_23 = {b.vec0()[1], b.vec1()[1]};

  // swap lanes:
  //   return {a0, a1, a2, a3}
  //          {b0, b1, b2, b3}
  return std::make_pair(Vectorized{aa01, aa23}, Vectorized{bb_01, bb_23});
}

// Interleave helper built on vec_mergeh/vec_mergel of matching halves.
template  = 0>
std::pair, Vectorized> inline inner_interleave2(
    const Vectorized& a,
    const Vectorized& b) {
  // inputs:
  //   a = {a0, a1, a2, a3,, a4, a5, a6, a7}
  //   b = {b0, b1, b2, b3,, b4, b5, b6, b7}
  using vtype = typename Vectorized::vtype;
  vtype ab0011 = vec_mergeh(a.vec0(), b.vec0());
  vtype ab2233 = vec_mergel(a.vec0(), b.vec0());

  vtype ab2_0011 = vec_mergeh(a.vec1(), b.vec1());
  vtype ab2_2233 = vec_mergel(a.vec1(), b.vec1());
  // group cols crossing lanes:
  //   return {a0, b0, a1, b1,, a2, b2, a3, b3}
  //          {a4, b4, a5, b5,, a6, b6, a7, b7}

  return std::make_pair(
      Vectorized{ab0011, ab2233}, Vectorized{ab2_0011, ab2_2233});
}

// Deinterleave helper: two rounds of vec_mergeh/vec_mergel per input separate
// the a-elements from the b-elements.
template  = 0>
std::pair, Vectorized> inline inner_deinterleave2(
    const Vectorized& a,
    const Vectorized& b) {
  // inputs:
  //   a = {a0, b0, a1, b1,, a2, b2, a3, b3}
  //   b = {a4, b4, a5, b5,, a6, b6, a7, b7}
  using vtype = typename Vectorized::vtype;
  // {a0,a2,b0,b2} {a1,a3,b1,b3}
  vtype a0a2b0b2 = vec_mergeh(a.vec0(), a.vec1());
  vtype a1a3b1b3 = vec_mergel(a.vec0(), a.vec1());

  vtype aa0123 = vec_mergeh(a0a2b0b2, a1a3b1b3);
  vtype bb0123 = vec_mergel(a0a2b0b2, a1a3b1b3);

  vtype a0a2b0b2_2 = vec_mergeh(b.vec0(), b.vec1());
  vtype a1a3b1b3_2 = vec_mergel(b.vec0(), b.vec1());

  vtype aa0123_2 = vec_mergeh(a0a2b0b2_2, a1a3b1b3_2);
  vtype bb0123_2 = vec_mergel(a0a2b0b2_2, a1a3b1b3_2);

  // it could be done with vec_perm ,too
  // swap lanes:
  //   return {a0, a1, a2, a3,, a4, a5, a6, a7}
  //          {b0, b1, b2, b3,, b4, b5, b6, b7}

  return std::make_pair(
      Vectorized{aa0123, aa0123_2}, Vectorized{bb0123, bb0123_2});
}

// Public interleave2 specializations: each forwards to inner_interleave2.
// (The element types they are specialized for are not visible in this copy.)
template <>
std::pair, Vectorized> inline interleave2(
    const Vectorized& a,
    const Vectorized& b) {
  return inner_interleave2(a, b);
}

template <>
std::pair, Vectorized> inline interleave2(
    const Vectorized& a,
    const Vectorized& b) {
  return inner_interleave2(a, b);
}

template <>
std::pair, Vectorized> inline interleave2(
    const Vectorized& a,
    const Vectorized& b) {
  return inner_interleave2(a, b);
}

template <>
std::pair, Vectorized> inline interleave2(
    const Vectorized& a,
    const Vectorized& b) {
  return inner_interleave2(a, b);
}

// Public deinterleave2 specializations: each forwards to inner_deinterleave2.
template <>
std::pair, Vectorized> inline deinterleave2(
    const Vectorized& a,
    const Vectorized& b) {
  return inner_deinterleave2(a, b);
}

template <>
std::pair, Vectorized> inline deinterleave2<
    int32_t>(const Vectorized& a, const Vectorized& b) {
  return inner_deinterleave2(a, b);
}

template <>
std::pair, Vectorized> inline deinterleave2(
    const Vectorized& a,
    const Vectorized& b) {
  return inner_deinterleave2(a, b);
}

template <>
std::pair, Vectorized> inline deinterleave2<
    int64_t>(const Vectorized& a, const Vectorized& b) {
  return inner_deinterleave2(a, b);
}

// Converts src to a float vector: src.to_vec_float_helper() followed by
// zvec_convert_to_float.
template
std::enable_if_t<
    std::is_same_v,
    at::vec::Vectorized<
        float>> inline convert_int8_to_float(const Vectorized& src) {
  // Note: this function only convert inputs number of elements equal to
  // at::vec::Vectorized.size() Only handle first 64 bits
  auto vec_int = src.to_vec_float_helper();

  return zvec_convert_to_float(vec_int);
}

// Converts src to integers, clamps the result to
// [numeric_limits::min(), numeric_limits::max()] of the (not visible) target
// type, and narrows with to_vec_uint8_helper().
template
std::enable_if_t<
    std::is_same_v,
    at::vec::Vectorized<
        T>> inline convert_float_to_int8(const Vectorized& src) {
  constexpr auto min_val = std::numeric_limits::min();
  constexpr auto max_val = std::numeric_limits::max();

  auto vec_int = clamp(
      zvec_convert_to_int(src),
      Vectorized(min_val),
      Vectorized(max_val));

  return vec_int.to_vec_uint8_helper();
}

#undef DEFINE_CLAMP_MAXMIN_FUNCS
#undef DEFINE_MAXMIN_FUNCS

} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace at