#pragma once #include #include #include namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> struct is_vec_specialized_for : std::bool_constant {}; template <> class Vectorized { private: union { struct { vint32 _vec0; vint32 _vec1; }; struct { vbool32 _vecb0; vbool32 _vecb1; }; } __attribute__((__may_alias__)); public: using value_type = int32_t; using vec_internal_type = vint32; using vec_internal_mask_type = vbool32; using size_type = int; static constexpr size_type size() { return 8; } Vectorized() {} C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} C10_ALWAYS_INLINE Vectorized(int32_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( int32_t scalar1, int32_t scalar2, int32_t scalar3, int32_t scalar4, int32_t scalar5, int32_t scalar6, int32_t scalar7, int32_t scalar8) : _vec0{vint32{scalar1, scalar2, scalar3, scalar4}}, _vec1{vint32{scalar5, scalar6, scalar7, scalar8}} {} C10_ALWAYS_INLINE const vec_internal_type& vec0() const { return _vec0; } C10_ALWAYS_INLINE const vec_internal_type& vec1() const { return _vec1; } template static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return a; } template static std::enable_if_t<(mask & 255) == 255, Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return b; } template static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return {b._vec0, a._vec1}; } template static std::enable_if_t<(mask > 0 && mask < 15), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr uint32_t g0 = (mask & 1) * 0xffffffff; constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; const vbool32 mask_1st = (vbool32){g0, g1, g2, g3}; return {(vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), a._vec1}; } template static std::enable_if_t< (mask > 15 && (mask & 255) != 255 && ((mask & 15) == 15)), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr uint32_t mask2 = (mask & 255) >> 4; constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; // generated masks return {b._vec0, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; } template static std::enable_if_t< (mask > 15 && ((mask & 255) != 255) && ((mask & 15) == 0)), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr uint32_t mask2 = (mask & 255) >> 4; constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; // generated masks return {a, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; } template static std::enable_if_t< (mask > 15 && ((mask & 255) != 255) && ((mask & 15) != 0) && ((mask & 15) != 15)), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { constexpr uint32_t g0 = (mask & 1) * 0xffffffff; constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; constexpr uint32_t mask2 = (mask & 255) >> 4; constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; const vbool32 mask_1st = (vbool32){g0, g1, g2, g3}; const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; // generated masks return { (vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; } static Vectorized C10_ALWAYS_INLINE blendv( const Vectorized& a, const Vectorized& b, const Vectorized& mask) { // the mask used here returned by comparision of vec256 // assuming this we can use the same mask directly with vec_sel // warning intel style mask will not work properly return { vec_sel(a._vec0, b._vec0, mask._vecb0), vec_sel(a._vec1, b._vec1, mask._vecb1)}; } template static Vectorized arange( int32_t base = 0.f, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); } static Vectorized set( const Vectorized& a, const Vectorized& b, size_t count = size()) { switch (count) { case 0: return a; case 1: return blend<1>(a, b); case 2: return blend<3>(a, b); case 3: return blend<7>(a, b); case 4: return blend<15>(a, b); case 5: return blend<31>(a, b); case 6: return blend<63>(a, b); case 7: return blend<127>(a, b); } return b; } static Vectorized C10_ALWAYS_INLINE loadu(const void* ptr, int count = size()) { if (count == size()) { return { vec_vsx_ld(offset0, reinterpret_cast(ptr)), vec_vsx_ld(offset16, reinterpret_cast(ptr))}; } __at_align__ value_type tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; } void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { if (count == size()) { vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); } else if (count > 0) { __at_align__ value_type tmp_values[size()]; vec_vsx_st(_vec0, offset0, tmp_values); vec_vsx_st(_vec1, offset16, tmp_values); std::memcpy( ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); } } const int32_t& operator[](int idx) const = delete; int32_t& operator[](int idx) = delete; Vectorized angle() const { return blendv( Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); } Vectorized real() const { return *this; } Vectorized imag() const { return Vectorized{0}; } Vectorized conj() const { return *this; } Vectorized C10_ALWAYS_INLINE abs() const { return {vec_abs(_vec0), vec_abs(_vec1)}; } Vectorized C10_ALWAYS_INLINE neg() const { return {vec_neg(_vec0), vec_neg(_vec1)}; } DEFINE_MEMBER_UNARY_OP(operator~, int32_t, vec_not) DEFINE_MEMBER_OP(operator==, int32_t, vec_cmpeq) DEFINE_MEMBER_OP(operator!=, int32_t, vec_cmpne) DEFINE_MEMBER_OP(operator<, int32_t, vec_cmplt) DEFINE_MEMBER_OP(operator<=, int32_t, vec_cmple) DEFINE_MEMBER_OP(operator>, int32_t, vec_cmpgt) DEFINE_MEMBER_OP(operator>=, int32_t, vec_cmpge) DEFINE_MEMBER_OP_AND_ONE(eq, int32_t, vec_cmpeq) DEFINE_MEMBER_OP_AND_ONE(ne, int32_t, vec_cmpne) DEFINE_MEMBER_OP_AND_ONE(lt, int32_t, vec_cmplt) DEFINE_MEMBER_OP_AND_ONE(le, int32_t, vec_cmple) DEFINE_MEMBER_OP_AND_ONE(gt, int32_t, vec_cmpgt) DEFINE_MEMBER_OP_AND_ONE(ge, int32_t, vec_cmpge) DEFINE_MEMBER_OP(operator+, int32_t, vec_add) DEFINE_MEMBER_OP(operator-, int32_t, vec_sub) DEFINE_MEMBER_OP(operator*, int32_t, vec_mul) DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int32_t, /) DEFINE_MEMBER_OP(maximum, int32_t, vec_max) DEFINE_MEMBER_OP(minimum, int32_t, vec_min) DEFINE_MEMBER_OP(operator&, int32_t, vec_and) DEFINE_MEMBER_OP(operator|, int32_t, vec_or) DEFINE_MEMBER_OP(operator^, int32_t, vec_xor) }; template <> Vectorized inline operator<<( const Vectorized& a, const Vectorized& b) { vuint32 shift_vec0 = reinterpret_cast(b.vec0()); vuint32 shift_vec1 = reinterpret_cast(b.vec1()); return Vectorized{ vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; } template <> Vectorized inline operator>>( const Vectorized& a, const Vectorized& b) { vuint32 shift_vec0 = reinterpret_cast(b.vec0()); vuint32 shift_vec1 = reinterpret_cast(b.vec1()); return Vectorized{ vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; } template <> Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { return a.maximum(b); } template <> Vectorized inline minimum( const Vectorized& a, const Vectorized& b) { return a.minimum(b); } template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator-(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator*(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator/(const Vectorized& a, const Vectorized& b) { return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; } template <> Vectorized C10_ALWAYS_INLINE operator&(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator|(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; } template <> Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, const Vectorized& b) { return Vectorized{ vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; } } // namespace CPU_CAPABILITY } // namespace vec } // namespace at