#pragma once #include #include #include namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> struct is_vec_specialized_for : std::bool_constant {}; template <> class Vectorized { private: union { struct { vint16 _vec0; vint16 _vec1; }; struct { vbool16 _vecb0; vbool16 _vecb1; }; } __attribute__((__may_alias__)); public: using value_type = int16_t; using vec_internal_type = vint16; using vec_internal_mask_type = vbool16; using size_type = int; static constexpr size_type size() { return 16; } Vectorized() {} C10_ALWAYS_INLINE Vectorized(vint16 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool16 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint16 v1, vint16 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2) : _vecb0{v1}, _vecb1{v2} {} C10_ALWAYS_INLINE Vectorized(int16_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( int16_t scalar1, int16_t scalar2, int16_t scalar3, int16_t scalar4, int16_t scalar5, int16_t scalar6, int16_t scalar7, int16_t scalar8, int16_t scalar9, int16_t scalar10, int16_t scalar11, int16_t scalar12, int16_t scalar13, int16_t scalar14, int16_t scalar15, int16_t scalar16) : _vec0{vint16{ scalar1, scalar2, scalar3, scalar4, scalar5, scalar6, scalar7, scalar8}}, _vec1{vint16{ scalar9, scalar10, scalar11, scalar12, scalar13, scalar14, scalar15, scalar16}} {} C10_ALWAYS_INLINE const vec_internal_type& vec0() const { return _vec0; } C10_ALWAYS_INLINE const vec_internal_type& vec1() const { return _vec1; } template static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return a; } template static std::enable_if_t<(mask & 65535) == 65535, Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return b; } template static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return {b._vec0, a._vec1}; } template static std::enable_if_t<(mask > 0 && mask < 255), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr int16_t g0 = (mask & 1) * 0xffff; constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff; constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff; constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff; constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff; constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff; const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7}; return {(vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), a._vec1}; } template static std::enable_if_t< (mask > 255 && (mask & 65535) != 65535 && ((mask & 255) == 255)), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr int16_t g0_2 = (mask & 1) * 0xffff; constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; const vint16 mask_2nd = vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; // generated masks return {b._vec0, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; } template static std::enable_if_t< (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) == 0)), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr int16_t mask2 = (mask & 65535) >> 16; constexpr int16_t g0_2 = (mask & 1) * 0xffff; constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; const vint16 mask_2nd = vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; // generated masks return {a, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; } template static std::enable_if_t< (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) != 0) && ((mask & 255) != 255)), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr int16_t g0 = (mask & 1) * 0xffff; constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff; constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff; constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff; constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff; constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff; constexpr int16_t mask2 = (mask & 65535) >> 16; constexpr int16_t g0_2 = (mask & 1) * 0xffff; constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7}; const vint16 mask_2nd = vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; // generated masks return { (vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; } static Vectorized C10_ALWAYS_INLINE blendv( const Vectorized& a, const Vectorized& b, const Vectorized& mask) { // the mask used here returned by comparision of vec256 // assuming this we can use the same mask directly with vec_sel // warning intel style mask will not work properly return { vec_sel(a._vec0, b._vec0, mask._vecb0), vec_sel(a._vec1, b._vec1, mask._vecb1)}; } template static Vectorized arange( int16_t base = 0, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step, base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step, base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step); } static Vectorized set( const Vectorized& a, const Vectorized& b, size_t count = size()) { switch (count) { case 0: return a; case 1: return blend<1>(a, b); case 2: return blend<3>(a, b); case 3: return blend<7>(a, b); case 4: return blend<15>(a, b); case 5: return blend<31>(a, b); case 6: return blend<63>(a, b); case 7: return blend<127>(a, b); case 8: return blend<255>(a, b); case 9: return blend<511>(a, b); case 10: return blend<1023>(a, b); case 11: return blend<2047>(a, b); case 12: return blend<4095>(a, b); case 13: return blend<8191>(a, b); case 14: return blend<16383>(a, b); case 15: return blend<32767>(a, b); } return b; } static Vectorized C10_ALWAYS_INLINE loadu(const void* ptr, int count = size()) { if (count == size()) { return { vec_vsx_ld(offset0, reinterpret_cast(ptr)), vec_vsx_ld(offset16, reinterpret_cast(ptr))}; } __at_align__ value_type tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; } void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { if (count == size()) { vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); } else if (count > 0) { __at_align__ value_type tmp_values[size()]; vec_vsx_st(_vec0, offset0, tmp_values); vec_vsx_st(_vec1, offset16, tmp_values); std::memcpy( ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); } } const int16_t& operator[](int idx) const = delete; int16_t& operator[](int idx) = delete; Vectorized angle() const { return blendv( Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); } Vectorized real() const { return *this; } Vectorized imag() const { return Vectorized{0}; } Vectorized conj() const { return *this; } Vectorized C10_ALWAYS_INLINE abs() const { return {vec_abs(_vec0), vec_abs(_vec1)}; } Vectorized C10_ALWAYS_INLINE neg() const { return {vec_neg(_vec0), vec_neg(_vec1)}; } DEFINE_MEMBER_UNARY_OP(operator~, int16_t, vec_not) DEFINE_MEMBER_OP(operator==, int16_t, vec_cmpeq) DEFINE_MEMBER_OP(operator!=, int16_t, vec_cmpne) DEFINE_MEMBER_OP(operator<, int16_t, vec_cmplt) DEFINE_MEMBER_OP(operator<=, int16_t, vec_cmple) DEFINE_MEMBER_OP(operator>, int16_t, vec_cmpgt) DEFINE_MEMBER_OP(operator>=, int16_t, vec_cmpge) DEFINE_MEMBER_OP_AND_ONE(eq, int16_t, vec_cmpeq) DEFINE_MEMBER_OP_AND_ONE(ne, int16_t, vec_cmpne) DEFINE_MEMBER_OP_AND_ONE(lt, int16_t, vec_cmplt) DEFINE_MEMBER_OP_AND_ONE(le, int16_t, vec_cmple) DEFINE_MEMBER_OP_AND_ONE(gt, int16_t, vec_cmpgt) DEFINE_MEMBER_OP_AND_ONE(ge, int16_t, vec_cmpge) DEFINE_MEMBER_OP(operator+, int16_t, vec_add) DEFINE_MEMBER_OP(operator-, int16_t, vec_sub) DEFINE_MEMBER_OP(operator*, int16_t, vec_mul) DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int16_t, /) DEFINE_MEMBER_OP(maximum, int16_t, vec_max) DEFINE_MEMBER_OP(minimum, int16_t, vec_min) DEFINE_MEMBER_OP(operator&, int16_t, vec_and) DEFINE_MEMBER_OP(operator|, int16_t, vec_or) DEFINE_MEMBER_OP(operator^, int16_t, vec_xor) }; template <> Vectorized inline operator<<( const Vectorized& a, const Vectorized& b) { vuint16 shift_vec0 = reinterpret_cast(b.vec0()); vuint16 shift_vec1 = reinterpret_cast(b.vec1()); return Vectorized{ vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; } template <> Vectorized inline operator>>( const Vectorized& a, const Vectorized& b) { vuint16 shift_vec0 = reinterpret_cast(b.vec0()); vuint16 shift_vec1 = reinterpret_cast(b.vec1()); return Vectorized{ vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; } template <> Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { return a.maximum(b); } template <> Vectorized inline minimum( const Vectorized& a, const Vectorized& b) { return a.minimum(b); } template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator-(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator*(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator/(const Vectorized& a, const Vectorized& b) { return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; } template <> Vectorized C10_ALWAYS_INLINE operator&(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator|(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; } } // namespace CPU_CAPABILITY } // namespace vec } // namespace at