// Copyright Naoki Shibata and contributors 2010 - 2021. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #include #include "misc.h" #ifndef CONFIG #error CONFIG macro not defined #endif #define ENABLE_DP #define ENABLE_SP #define LOG2VECTLENDP CONFIG #define VECTLENDP (1 << LOG2VECTLENDP) #define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) #define DFTPRIORITY LOG2VECTLENDP #if defined(__clang__) #define ISANAME "Clang Vector Extension" typedef uint32_t vmask __attribute__((ext_vector_type(VECTLENDP*2))); typedef uint32_t vopmask __attribute__((ext_vector_type(VECTLENDP*2))); typedef double vdouble __attribute__((ext_vector_type(VECTLENDP))); typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP))); typedef float vfloat __attribute__((ext_vector_type(VECTLENDP*2))); typedef int32_t vint2 __attribute__((ext_vector_type(VECTLENDP*2))); #ifdef ENABLE_LONGDOUBLE typedef uint8_t vmaskl __attribute__((ext_vector_type(sizeof(long double)*VECTLENDP))); typedef long double vlongdouble __attribute__((ext_vector_type(VECTLENDP))); #endif #if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128) typedef uint8_t vmaskq __attribute__((ext_vector_type(sizeof(Sleef_quad)*VECTLENDP))); #ifdef ENABLE_LONGDOUBLE typedef Sleef_quad vquad __attribute__((ext_vector_type(VECTLENDP))); #endif #endif #elif defined(__GNUC__) #define ISANAME "GCC Vector Extension" typedef uint32_t vmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2))); typedef uint32_t vopmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2))); typedef double vdouble __attribute__((vector_size(sizeof(double)*VECTLENDP))); typedef int32_t vint __attribute__((vector_size(sizeof(int32_t)*VECTLENDP))); typedef float vfloat __attribute__((vector_size(sizeof(float)*VECTLENDP*2))); typedef int32_t vint2 __attribute__((vector_size(sizeof(int32_t)*VECTLENDP*2))); #ifdef 
ENABLE_LONGDOUBLE typedef uint8_t vmaskl __attribute__((vector_size(sizeof(long double)*VECTLENDP))); typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*VECTLENDP))); #endif #if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128) typedef uint8_t vmaskq __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP))); typedef Sleef_quad vquad __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP))); #endif #endif // #if VECTLENDP == 2 static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], 0, 0 }; } static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1] }; } static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; } static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; } static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; } static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; } #ifdef ENABLE_LONGDOUBLE static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; } #endif #if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128) static INLINE vquad vcast_vq_q(Sleef_quad d) { return (vquad) { d, d }; } #endif static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h }; } static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1] }; } static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3] }; } static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; } static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; } static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; } static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; } static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; } static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; } #ifdef 
ENABLE_LONGDOUBLE static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0] }; } static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return vd; } static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1] }; } static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1] }; } #endif #if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128) static INLINE vquad vrev21_vq_vq(vquad vd) { return (vquad) { vd[1], vd[0] }; } static INLINE vquad vreva2_vq_vq(vquad vd) { return vd; } static INLINE vquad vposneg_vq_vq(vquad vd) { return (vquad) { +vd[0], -vd[1] }; } static INLINE vquad vnegpos_vq_vq(vquad vd) { return (vquad) { -vd[0], +vd[1] }; } #endif #define PNMASK ((vdouble) { +0.0, -0.0 }) #define NPMASK ((vdouble) { -0.0, +0.0 }) static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); } #define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f }) #define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f }) static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); } #elif VECTLENDP == 4 static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], 0, 0, 0, 0 }; } static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3] }; } static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i }; } static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i }; } static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f }; } static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d }; } #ifdef ENABLE_LONGDOUBLE static INLINE vlongdouble vcast_vl_l(long double d) { 
return (vlongdouble) { d, d, d, d }; } #endif static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h }; } static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3] }; } static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7] }; } static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3] }; } static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], 0, 0, 0, 0 }; } #define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 }) #define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 }) static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); } #define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f }) #define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f }) static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); } static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2] }; } static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[2], vd[3], vd[0], vd[1] }; } static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; } static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; } #ifdef ENABLE_LONGDOUBLE static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2] }; } static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[2], vd[3], vd[0], vd[1] }; } static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return 
(vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3] }; } static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3] }; } #endif #elif VECTLENDP == 8 static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], m[9], m[11], m[13], m[15], 0, 0, 0, 0, 0, 0, 0, 0 }; } static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3], m[4], m[4], m[5], m[5], m[6], m[6], m[7], m[7] }; } static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i, i, i, i, i }; } static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i }; } static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f }; } static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d, d, d, d, d }; } #ifdef ENABLE_LONGDOUBLE static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d, d, d, d, d }; } #endif static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h, l, h, l, h, l, h, l, h }; } static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3], 0, vi[4], 0, vi[5], 0, vi[6], 0, vi[7] }; } static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7], vi2[9], vi2[11], vi2[13], vi2[15] }; } static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3], vi2[4], vi2[5], vi2[6], vi2[7] }; } static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], vi[4], vi[5], vi[6], vi[7], 0, 0, 0, 0, 0, 0, 0, 0 }; } #define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0 }) #define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0 }) static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); } 
// Tail of the VECTLENDP == 8 branch: remaining sign-flip and lane-shuffle
// helpers, then the #else generic fallback (mostly lost to garbling).
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }

// Alternating sign-bit constants for the 16-lane float pipeline.
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }

// rev21: swap within adjacent lane pairs; reva2: reverse the order of pairs.
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6], vd[9], vd[8], vd[11], vd[10], vd[13], vd[12], vd[15], vd[14] }; }
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[14], vd[15], vd[12], vd[13], vd[10], vd[11], vd[8], vd[9], vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1]}; }

#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3], +vd[4], -vd[5], +vd[6], -vd[7] }; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3], -vd[4], +vd[5], -vd[6], +vd[7] }; }
#endif

#else
// NOTE(review): extraction garbling — everything between "i" and " y)" on
// the next line (from the '<' of the loop condition up to a later '>') has
// been eaten.  The missing span presumably held the loop-based
// any-VECTLENDP fallback and the shared double-precision helpers (including
// vsel_vd_vo_vd_vd and vmax_vd_vd_vd, whose tail "y), x, y); }" survives) —
// restore from the upstream file before building.  Left byte-identical here.
static INLINE vint vcast_vi_i(int k) { vint ret; for(int i=0;i y), x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x < y), x, y); }
// Double-precision alternating sub/add, comparisons, vint/vint2 integer ops,
// and Newton-iteration square roots for double and float.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }

// Lane-wise comparisons; the vector extension yields all-ones/all-zeros lanes.
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x == y); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x != y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x < y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x <= y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x > y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x >= y); }

static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
static INLINE vint vneg_vi_vi(vint e) { return -e; }

static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }

// vopmask has twice the lanes of vint; use only its first half as the mask.
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~vreinterpretFirstHalf_vi_vi2((vint2)x); }

// Shifts go through an unsigned lane type so << / >> are well-defined
// logical shifts; vsra keeps the signed type for an arithmetic shift.
static INLINE vint vsll_vi_vi_i(vint x, int c) {
#if defined(__clang__)
  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
#else
  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
#endif
  return (vint)(((vu)x) << c);
}

static INLINE vint vsrl_vi_vi_i(vint x, int c) {
#if defined(__clang__)
  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
#else
  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
#endif
  return (vint)(((vu)x) >> c);
}

static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }

static INLINE vint veq_vi_vi_vi(vint x, vint y) { return x == y; }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return x > y; }

static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x == y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x > y);}

static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
  return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
		      vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
}

static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vabs_vd_vd(d) == SLEEF_INFINITY); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(d == SLEEF_INFINITY); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(d == -SLEEF_INFINITY); }
// NaN is the only value that compares unequal to itself.
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(d != d); }

// Square root without a hardware instruction: rescale tiny inputs, seed a
// reciprocal-sqrt estimate from integer bit manipulation, then refine with
// Newton iterations; negative inputs are forced to NaN via the mask OR.
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#if defined(__clang__)
  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
  vdouble q = vcast_vd_d(1);

  // Scale subnormal-range inputs up so the estimate below stays accurate;
  // q carries the compensating factor for the final result.
  vopmask o = (vopmask)(d < 8.636168555094445E-78);
  d = (vdouble)((o & (vmask)(d * 1.157920892373162E77)) | (~o & (vmask)d));
  q = (vdouble)((o & (vmask)vcast_vd_d(2.9387358770557188E-39)) | (~o & (vmask)vcast_vd_d(1)));

  q = (vdouble)vor_vm_vm_vm(vlt_vo_vd_vd(d, vcast_vd_d(0)), (vmask)q);

  // Magic-constant initial estimate of 1/sqrt(d) (bit-level trick).
  vdouble x = (vdouble)(0x5fe6ec85e7de30daLL - ((vi64)(d + 1e-320) >> 1));

  x = x * (  3 - d * x * x);
  x = x * ( 12 - d * x * x);
  x = x * (768 - d * x * x);
  x *= 1.0 / (1 << 13);
  x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;

  return x * q;
}

static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }

static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
// NOTE(review): extraction garbling below — the span from the '<' of the
// loop condition up to a later '>' has been eaten.  It presumably covered
// the rest of vloadu_vd_p, the double store/gather helpers and the start of
// the single-precision section; "y), x, y); }" is the surviving tail of the
// float max helper.  Restore from upstream.  Left byte-identical here.
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i y), x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x < y), x, y); }

static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x == y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x != y); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x < y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x <= y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x > y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x >= y); }

static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }

static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }

// vopmask and vint2 have the same lane count, so the mask is used directly.
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }

static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
#if defined(__clang__)
  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
#else
  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
#endif
  return (vint2)(((vu)x) << c);
}

static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
#if defined(__clang__)
  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
#else
  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
#endif
  return (vint2)(((vu)x) >> c);
}

static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> c; }

static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x == y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x > y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return x == y; }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return x > y; }

static INLINE vopmask visinf_vo_vf(vfloat d) { return (vopmask)(vabs_vf_vf(d) == SLEEF_INFINITYf); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return (vopmask)(d == SLEEF_INFINITYf); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return (vopmask)(d == -SLEEF_INFINITYf); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return (vopmask)(d != d); }

// Single-precision analogue of vsqrt_vd_vd: rescale small inputs, magic
// initial estimate, Newton refinement; negatives become NaN via the OR mask.
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
  vfloat q = vcast_vf_f(1);

  vopmask o = (vopmask)(d < 5.4210108624275221700372640043497e-20f); // 2^-64
  d = (vfloat)((o & (vmask)(d * vcast_vf_f(18446744073709551616.0f))) | (~o & (vmask)d)); // 2^64
  q = (vfloat)((o & (vmask)vcast_vf_f(0.00000000023283064365386962890625f)) | (~o & (vmask)vcast_vf_f(1))); // 2^-32

  q = (vfloat)vor_vm_vm_vm(vlt_vo_vf_vf(d, vcast_vf_f(0)), (vmask)q);

  // Magic-constant initial estimate of 1/sqrt(d).
  vfloat x = (vfloat)(0x5f330de2 - (((vint2)d) >> 1));

  x = x * ( 3.0f - d * x * x);
  x = x * (12.0f - d * x * x);
  x *= 0.0625f;
  x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;

  return x * q;
}

static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
// NOTE(review): truncated at the end of this chunk — the loop condition and
// the rest of the file continue beyond what is visible here.  Left as-is.
static INLINE vfloat vloadu_vf_p(const float *ptr) { vfloat vf; for(int i=0;i