//   Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

#if CONFIG == 1 || CONFIG == 2 || CONFIG == 3 || CONFIG == 4

#ifndef __VSX__
#error Please specify -mcpu=power8 or -mcpu=power9
#endif

#else
#error CONFIG macro invalid or not defined
#endif

#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#if CONFIG == 1 || CONFIG == 3
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif

#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING

#if !defined(SLEEF_GENHEADER)
#include <altivec.h>
// undef altivec types since CPP and C99 use them as compiler tokens
// use __vector and __bool instead
#undef vector
#undef bool

#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

#if CONFIG == 1 || CONFIG == 2
#define ISANAME "VSX"
#else
#define ISANAME "VSX-3"
#endif
#define DFTPRIORITY 25

static INLINE int vavailability_i(int name) { return 3; }
static INLINE void vprefetch_v_p(const void *ptr) { }

/**********************************************
** Types
**********************************************/
typedef __vector unsigned int vmask;
// using __bool with typedef may cause ambiguous errors
#define vopmask __vector __bool int
//@#define vopmask __vector __bool int

typedef __vector signed int vint;
typedef __vector signed int vint2;
typedef __vector float vfloat;
typedef __vector double vdouble;

// internal use types
typedef __vector unsigned int v__u32;
typedef __vector unsigned char v__u8;
typedef __vector signed long long v__i64;
typedef __vector unsigned long long v__u64;
#define v__b64 __vector __bool long long

typedef __vector long long vint64;
typedef __vector unsigned long long vuint64;

typedef struct {
  vmask x, y;
} vquad;

typedef vquad vargquad;

/**********************************************
** Utilities
**********************************************/
#define vset__vi(v0, v1) ((vint) {v0, v1, v0, v1})
#define vset__vi2(...) ((vint2) {__VA_ARGS__})
#define vset__vm(...) ((vmask) {__VA_ARGS__})
#define vset__vo(...) ((vopmask) {__VA_ARGS__})
#define vset__vf(...) ((vfloat) {__VA_ARGS__})
#define vset__vd(...) ((vdouble) {__VA_ARGS__})
#define vset__u8(...) ((v__u8) {__VA_ARGS__})
#define vset__u32(...) ((v__u32) {__VA_ARGS__})
#define vset__s64(...) ((v__i64) {__VA_ARGS__})
#define vset__u64(...) ((v__u64) {__VA_ARGS__})
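// Lane-layout note (editorial addition, hedged): with LOG2VECTLENDP == 1 each
// 128-bit register holds VECTLENDP == 2 doubles or VECTLENSP == 4 floats.  A
// vint used on the double-precision path therefore only carries a payload in
// its first two 32-bit lanes, which is presumably why vset__vi duplicates the
// pair into the upper lanes:
//
//   vint  idx = vset__vi(3, 7);         // lanes {3, 7, 3, 7}
//   vint2 w   = vset__vi2(0, 1, 2, 3);  // all four lanes significant (SP path)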
#define vsetall__vi(v) vset__vi((int)v, (int)v)
#define vsetall__vi2(v) vset__vi2((int)v, (int)v, (int)v, (int)v)
#define vsetall__vm(v) vset__vm(v, v, v, v)
#define vsetall__vo(v) vset__vo(v, v, v, v)
#define vsetall__vf(v) vset__vf((float)v, (float)v, (float)v, (float)v)
#define vsetall__vd(v) vset__vd((double)v, (double)v)
#define vsetall__u8(v) vset__u8((uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v)
#define vsetall__u32(v) vset__u32((uint32_t)v, (uint32_t)v, (uint32_t)v, (uint32_t)v)
#define vsetall__s64(v) vset__s64((int64_t)v, (int64_t)v)
#define vsetall__u64(v) vset__u64((uint64_t)v, (uint64_t)v)

#define vzero__vi() vsetall__vi(0)
#define vzero__vi2() vsetall__vi2(0)
#define vzero__vm() vsetall__vm(0)
#define vzero__vo() vsetall__vo(0)
#define vzero__vf() vsetall__vf(0)
#define vzero__vd() vsetall__vd(0)
#define vzero__u8() vsetall__u8(0)
#define vzero__u32() vsetall__u32(0)
#define vzero__s64() vsetall__s64(0)
#define vzero__u64() vsetall__u64(0)

//// Swap doubleword elements
#if defined(__clang__) || __GNUC__ >= 7
static INLINE v__u64 v__swapd_u64(v__u64 v) { return vec_xxpermdi(v, v, 2); }
#else
static INLINE v__u64 v__swapd_u64(v__u64 v)
{
  __asm__ __volatile__("xxswapd %x0,%x1" : "=wa" (v) : "wa" (v));
  return v;
}
#endif

/**********************************************
** Memory
**********************************************/

////////////// Unaligned memory access //////////////

/**
 * It's not safe to use vector assignment via (cast & dereference) for unaligned memory access
 * with almost all clang versions and GCC 8 when VSX3 isn't enabled;
 * these compilers tend to generate the aligned instructions 'lvx/stvx' instead of 'lxvd2x/lxvw4x/stxvd2x/stxvw4x'.
 * For more information, see https://github.com/seiko2plus/vsx_mem_test
 *
 * TODO: check GCC(9, 10)
 */

//// load
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
static vint vloadu_vi_p(const int32_t *ptr) { return *((vint*)ptr); }
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr) { return *((vint2*)ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return *((vfloat*)ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return *((vdouble*)ptr); }
#else
static vint vloadu_vi_p(const int32_t *ptr) { return vec_vsx_ld(0, ptr); }
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr) { return vec_vsx_ld(0, ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return vec_vsx_ld(0, ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return vec_vsx_ld(0, ptr); }
#endif

//// store
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
static void vstoreu_v_p_vi(int32_t *ptr, vint v) { *((vint*)ptr) = v; }
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v) { *((vint2*)ptr) = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { *((vfloat*)ptr) = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { *((vdouble*)ptr) = v; }
#else
static void vstoreu_v_p_vi(int32_t *ptr, vint v) { vec_vsx_st(v, 0, ptr); }
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v) { vec_vsx_st(v, 0, ptr); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vec_vsx_st(v, 0, ptr); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { vec_vsx_st(v, 0, ptr); }
#endif
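// Usage sketch for the unaligned accessors above (editorial addition; the
// buffer and offset are hypothetical).  Loads and stores whose address may not
// be 16-byte aligned should go through vloadu_* / vstoreu_* so that the
// compilers listed above emit lxvd2x/lxvw4x rather than the aligned-only
// lvx/stvx:
//
//   float buf[8];
//   /* ... fill buf ... */
//   vfloat v = vloadu_vf_p(buf + 1);   // buf + 1 is only 4-byte aligned
//   vstoreu_v_p_vf(buf + 1, v);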
////////////// aligned memory access //////////////
//// load
static INLINE vfloat vload_vf_p(const float *ptr) { return vec_ld(0, ptr); }
static INLINE vdouble vload_vd_p(const double *ptr) { return *((vdouble*)ptr); }

//// store
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vec_st(v, 0, ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *((vdouble*)ptr) = v; }

////////////// non-temporal memory access //////////////
//// store
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }

////////////// LUT //////////////
//// load
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi)
{ return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2)
{
  return vset__vf(
    ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)],
    ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)]
  );
}

//// store
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
{
  const v__u64 vll = (v__u64)v;
  float *ptr_low  = ptr + offset*2;
  float *ptr_high = ptr + (offset + step)*2;
  *((uint64_t*)ptr_low)  = vec_extract(vll, 0);
  *((uint64_t*)ptr_high) = vec_extract(vll, 1);
}

static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
{ vscatter2_v_p_i_i_vf(ptr, offset, step, v); }

static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
{ vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
{ vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
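// Gather sketch (editorial addition; "tbl" and the index values are
// hypothetical).  vgather_vd_p_vi reads one table entry per 64-bit lane, using
// the first two 32-bit lanes of the index vector:
//
//   static const double tbl[4] = {1.0, 2.0, 4.0, 8.0};
//   vint    idx = vset__vi(0, 3);
//   vdouble v   = vgather_vd_p_vi(tbl, idx);   // {tbl[0], tbl[3]} == {1.0, 8.0}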
/**********************************************
** Misc
**********************************************/

// vector with a specific value set to all lanes (Vector Splat)
static INLINE vint vcast_vi_i(int i) { return vsetall__vi(i); }
static INLINE vint2 vcast_vi2_i(int i) { return vsetall__vi2(i); }
static INLINE vfloat vcast_vf_f(float f) { return vsetall__vf(f); }
static INLINE vdouble vcast_vd_d(double d) { return vsetall__vd(d); }

// cast
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }

// get the first element
static INLINE float vcast_f_vf(vfloat v) { return vec_extract(v, 0); }
static INLINE double vcast_d_vd(vdouble v) { return vec_extract(v, 0); }

static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }

// per element select via mask (blend)
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return vec_sel(y, x, (v__b64)o); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return vec_sel(y, x, o); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return vec_sel(y, x, o); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return vec_sel(y, x, o); }

static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0)
{ return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0)); }

static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2)
{ return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2)); }

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3)
{ return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3))); }

static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0)
{ return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0)); }

static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2)
{ return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2)); }

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3)
{ return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3))); }

static INLINE int vtestallones_i_vo32(vopmask g) { return vec_all_ne((vint2)g, vzero__vi2()); }
static INLINE int vtestallones_i_vo64(vopmask g) { return vec_all_ne((v__i64)g, vzero__s64()); }
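// Branchless-select sketch (editorial addition; the helper name "vclamp_lo" is
// hypothetical and not part of SLEEF).  A comparison from the Comparison
// section below yields a vopmask, which the vsel_* helpers above use as a
// per-lane blend:
//
//   static INLINE vdouble vclamp_lo(vdouble x, vdouble lo) {
//     vopmask o = vlt_vo_vd_vd(x, lo);      // lanes where x < lo
//     return vsel_vd_vo_vd_vd(o, lo, x);    // pick lo there, keep x elsewhere
//   }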
/**********************************************
** Conversions
**********************************************/
////////////// Numeric //////////////

// pack 64-bit mask to 32-bit
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask)vec_pack((v__u64)m, (v__u64)m); }

// clip 64-bit lanes to lower 32-bit
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return vec_mergeo(vi2, vec_splat(vi2, 3)); }
static INLINE vint vcastu_vi_vm(vmask vi2) { return vec_mergeo((vint2)vi2, vec_splat((vint2)vi2, 3)); }

// expand lower 32-bit mask
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vec_mergeh(m, m); }

// unsigned expand lower 32-bit integer
static INLINE vint2 vcastu_vi2_vi(vint vi) { return vec_mergeh(vzero__vi(), vi); }
static INLINE vmask vcastu_vm_vi(vint vi) { return (vmask)vec_mergeh(vzero__vi(), vi); }

static INLINE vopmask vcast_vo_i(int i)
{
  i = i ? -1 : 0;
  return (vopmask) { (unsigned int)i, (unsigned int)i, (unsigned int)i, (unsigned int)i };
}

// signed int to single-precision
static INLINE vfloat vcast_vf_vi2(vint2 vi)
{
  vfloat ret;
#if defined(__clang__) || __GNUC__ >= 9
  ret = __builtin_convertvector(vi, vfloat);
#else
  __asm__ __volatile__("xvcvsxwsp %x0,%x1" : "=wa" (ret) : "wa" (vi));
#endif
  return ret;
}

// lower signed int to double-precision
static INLINE vdouble vcast_vd_vi(vint vi)
{
  vdouble ret;
  vint swap = vec_mergeh(vi, vi);
#if defined(__clang__) || __GNUC__ >= 7
  ret = __builtin_vsx_xvcvsxwdp(swap);
#else
  __asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap));
#endif
  return ret;
}

// zip two scalars
static INLINE vmask vcast_vm_i_i(int l, int h)
{ return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); }

static INLINE vmask vcast_vm_i64(int64_t i) { return (vmask)vsetall__s64(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return (vmask)vsetall__u64(i); }

////////////// Truncation //////////////
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
{
  vint2 ret;
#if defined(__clang__) || __GNUC__ >= 9
  ret = __builtin_convertvector(vf, vint2);
#else
  __asm__ __volatile__("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (vf));
#endif
  return ret;
}

static INLINE vint vtruncate_vi_vd(vdouble vd)
{
  vint ret;
#if defined(__clang__) || __GNUC__ >= 7
  ret = __builtin_vsx_xvcvdpsxws(vd);
#else
  __asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd));
#endif
  return vec_mergeo(ret, vec_splat(ret, 3));
}

static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vec_trunc(vd); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return vec_trunc(vf); }

////////////// Rounding //////////////
// towards the nearest even
static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vec_rint(vd)); }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vec_rint(vf)); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vec_rint(vd); }
static INLINE vfloat vrint_vf_vf(vfloat vf) { return vec_rint(vf); }
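// Rounding sketch (editorial addition): under the default rounding mode
// vec_rint rounds halfway cases to the nearest even value, while vtruncate_*
// always rounds toward zero:
//
//   vdouble a = vrint_vd_vd(vcast_vd_d(2.5));       // {2.0, 2.0}  (ties to even)
//   vdouble b = vrint_vd_vd(vcast_vd_d(3.5));       // {4.0, 4.0}
//   vdouble c = vtruncate_vd_vd(vcast_vd_d(-1.7));  // {-1.0, -1.0}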
/**********************************************
** Logical
**********************************************/

////////////// And //////////////
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return vec_and(x, y); }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vec_and((vint)x, y); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_and(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vec_and((vint2)x, y); }
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vec_and(x, y); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vec_and((vmask)x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vec_and((vmask)x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vec_and(x, y); }

////////////// Or //////////////
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return vec_or(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_or(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vec_or(x, y); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vec_or((vmask)x, y); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vec_or((vmask)x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vec_or(x, y); }

////////////// Xor //////////////
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return vec_xor(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_xor(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vec_xor(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vec_xor((vmask)x, y); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vec_xor((vmask)x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vec_xor(x, y); }

////////////// Not //////////////
static INLINE vopmask vnot_vo_vo(vopmask o) { return vec_nor(o, o); }

////////////// And Not ((~x) & y) //////////////
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return vec_andc(y, x); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, (vint)x); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vec_andc(y, x); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vec_andc(y, x); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vec_andc(y, (vint2)x); }

/**********************************************
** Comparison
**********************************************/

////////////// Equal //////////////
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return (vint)vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vec_cmpeq(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vec_cmpeq(x, y); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpeq(x, y); }

////////////// Not Equal //////////////
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vnot_vo_vo(vec_cmpeq(x, y)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); }

////////////// Less Than //////////////
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vec_cmplt(x, y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmplt(x, y); }

////////////// Greater Than //////////////
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return (vint)vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return vec_cmpgt(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpgt(x, y); }

////////////// Less Than Or Equal //////////////
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vec_cmple(x, y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmple(x, y); }

////////////// Greater Than Or Equal //////////////
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpge(x, y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpge(x, y); }

////////////// Special Cases //////////////
static INLINE vopmask visinf_vo_vf(vfloat d) { return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); }
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); }

static INLINE vopmask vispinf_vo_vf(vfloat d) { return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); }

static INLINE vopmask visminf_vo_vf(vfloat d) { return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); }

static INLINE vopmask visnan_vo_vf(vfloat d) { return vnot_vo_vo(vec_cmpeq(d, d)); }
static INLINE vopmask visnan_vo_vd(vdouble d) { return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); }
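// Special-case sketch (editorial addition; assumes SLEEF_NAN is provided by
// misc.h).  visnan_* relies on the IEEE 754 rule that a NaN compares unequal
// to everything, itself included, so vec_cmpeq(d, d) is false exactly in the
// NaN lanes and vnot_vo_vo() of that mask flags them:
//
//   vdouble d = vset__vd(SLEEF_NAN, 1.0);
//   vopmask o = visnan_vo_vd(d);   // 64-bit lane 0 all-ones, lane 1 all-zeros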
/**********************************************
** Shift
**********************************************/

////////////// Left //////////////
static INLINE vint vsll_vi_vi_i(vint x, int c) { return vec_sl(x, vsetall__u32(c)); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vec_sl(x, vsetall__u32(c)); }

////////////// Right //////////////
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return vec_sr(x, vsetall__u32(c)); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vec_sr(x, vsetall__u32(c)); }

////////////// Algebraic Right //////////////
static INLINE vint vsra_vi_vi_i(vint x, int c) { return vec_sra(x, vsetall__u32(c)); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vec_sra(x, vsetall__u32(c)); }

/**********************************************
** Reorder
**********************************************/

////////////// Reverse //////////////
// Reverse elements order inside the lower and higher parts
static INLINE vint2 vrev21_vi2_vi2(vint2 vi) { return vec_mergee(vec_mergeo(vi, vi), vi); }
static INLINE vfloat vrev21_vf_vf(vfloat vf) { return (vfloat)vrev21_vi2_vi2((vint2)vf); }

// Swap the lower and higher parts
static INLINE vfloat vreva2_vf_vf(vfloat vf) { return (vfloat)v__swapd_u64((v__u64)vf); }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble)v__swapd_u64((v__u64)vd); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }

/**********************************************
** Arithmetic
**********************************************/

////////////// Negation //////////////
static INLINE vint vneg_vi_vi(vint e)
{
#if defined(__clang__) || __GNUC__ >= 9
  return vec_neg(e);
#else
  return vec_sub(vzero__vi(), e);
#endif
}

static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vneg_vi_vi(e); }

static INLINE vfloat vneg_vf_vf(vfloat d)
{
  vfloat ret;
#if defined(__clang__) || __GNUC__ >= 9
  ret = vec_neg(d);
#else
  __asm__ __volatile__("xvnegsp %x0,%x1" : "=wa" (ret) : "wa" (d));
#endif
  return ret;
}

static INLINE vdouble vneg_vd_vd(vdouble d)
{
  vdouble ret;
#if defined(__clang__) || __GNUC__ >= 9
  ret = vec_neg(d);
#else
  __asm__ __volatile__("xvnegdp %x0,%x1" : "=wa" (ret) : "wa" (d));
#endif
  return ret;
}

static INLINE vfloat vposneg_vf_vf(vfloat d) { return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); }
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vec_xor(d, vset__vd(+0.0, -0.0)); }

static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vec_xor(d, vset__vd(-0.0, +0.0)); }
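// Sign-pattern sketch (editorial addition): vnegpos_* XORs the sign bit of
// the even-indexed lanes only, so vsubadd_* (defined below) alternates
// subtract/add per lane:
//
//   vdouble x = vset__vd(5.0, 5.0), y = vset__vd(2.0, 2.0);
//   vdouble r = vsubadd_vd_vd_vd(x, y);   // {5.0 - 2.0, 5.0 + 2.0} == {3.0, 7.0}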
////////////// Addition //////////////
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return vec_add(x, y); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_add(x, y); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vec_add(x, y); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return vec_add(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return (vmask)vec_add((v__i64)x, (v__i64)y); }

////////////// Subtraction //////////////
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return vec_sub(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_sub(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vec_sub(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return vec_sub(x, y); }

static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vec_add(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vec_add(x, vnegpos_vf_vf(y)); }

////////////// Multiplication //////////////
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vec_mul(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return vec_mul(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return vec_div(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return vec_div(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vec_div(vsetall__vf(1.0f), x); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return vec_div(vsetall__vd(1.0), x); }

/**********************************************
** Math
**********************************************/
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vec_max(x, y); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vec_max(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vec_min(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vec_min(x, y); }

static INLINE vfloat vabs_vf_vf(vfloat f) { return vec_abs(f); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return vec_abs(d); }

static INLINE vfloat vsqrt_vf_vf(vfloat f) { return vec_sqrt(f); }
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vec_sqrt(d); }

/**********************************************
** FMA3
**********************************************/
#if CONFIG == 1 || CONFIG == 3
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_msub(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_add(vec_mul(x, y), z); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_add(vec_mul(x, y), z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_sub(vec_mul(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_sub(vec_mul(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_sub(z, vec_mul(x, y)); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_sub(z, vec_mul(x, y)); }
#endif
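// FMA note (editorial addition): with CONFIG == 1 or CONFIG == 3 the vmla_*
// family maps to vec_madd/vec_msub/vec_nmsub, i.e. fused multiply-add with a
// single rounding; otherwise it falls back to a separate vec_mul plus
// vec_add/vec_sub with two roundings.  The vfma_* helpers below always use the
// fused forms, e.g. (x, y, z being arbitrary vdouble values):
//
//   vdouble r1 = vfma_vd_vd_vd_vd(x, y, z);    // x*y + z, single rounding
//   vdouble r2 = vfmanp_vd_vd_vd_vd(x, y, z);  // z - x*y  (vec_nmsub)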
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }

static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }

static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_msub(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }

static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }

static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmadd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmadd(x, y, z); }

static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }

//

static vquad loadu_vq_p(void *p) {
  vquad vq;
  memcpy(&vq, p, VECTLENDP * 16);
  return vq;
}

static INLINE vquad cast_vq_aq(vargquad aq) {
  vquad vq;
  memcpy(&vq, &aq, VECTLENDP * 16);
  return vq;
}

static INLINE vargquad cast_aq_vq(vquad vq) {
  vargquad aq;
  memcpy(&aq, &vq, VECTLENDP * 16);
  return aq;
}

static INLINE int vtestallzeros_i_vo64(vopmask g) { return vec_all_eq((__vector signed long long)g, vzero__s64()); }

static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (v__b64)o); }

static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return (vmask)vec_sub((__vector signed long long)x, (__vector signed long long)y); }

static INLINE vmask vneg64_vm_vm(vmask x) { return (vmask)vec_sub((__vector signed long long) {0, 0}, (__vector signed long long)x); }

static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y); }

#define vsll64_vm_vm_i(x, c) ((vmask)vec_sl((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
#define vsrl64_vm_vm_i(x, c) ((vmask)vec_sr((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))

static INLINE vint vcast_vi_vm(vmask vm) { return (vint) { (int)vm[0], (int)vm[2] }; }
static INLINE vmask vcast_vm_vi(vint vi) { return (vmask) (__vector signed long long) { (signed long long)vi[0], (signed long long)vi[1] }; }

static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }