// Copyright Naoki Shibata and contributors 2010 - 2024. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. #if !defined(SLEEF_GENHEADER) #include #include #include #include #endif #include "quaddef.h" #include "misc.h" #ifndef SLEEF_ENABLE_CUDA extern const double Sleef_rempitabqp[]; #endif #define __SLEEFSIMDQP_C__ #if defined(_MSC_VER) && !defined (__clang__) #pragma fp_contract (off) #else #pragma STDC FP_CONTRACT OFF #endif #ifdef ENABLE_PUREC_SCALAR #define CONFIG 1 #include "helperpurec_scalar.h" #ifdef DORENAME #include "qrenamepurec_scalar.h" #endif #endif #ifdef ENABLE_PURECFMA_SCALAR #define CONFIG 2 #include "helperpurec_scalar.h" #ifdef DORENAME #include "qrenamepurecfma_scalar.h" #endif #if defined(_MSC_VER) && !defined (__clang__) #pragma optimize("", off) #endif #endif #ifdef SLEEF_ENABLE_CUDA #define CONFIG 3 #include "helperpurec_scalar.h" #ifdef DORENAME #include "qrenamecuda.h" #endif typedef vquad Sleef_quadx1; #endif #ifdef ENABLE_SSE2 #define CONFIG 2 #include "helpersse2.h" #ifdef DORENAME #include "qrenamesse2.h" #endif typedef vquad Sleef_quadx2; #endif #ifdef ENABLE_AVX2128 #define CONFIG 1 #include "helperavx2_128.h" #ifdef DORENAME #include "qrenameavx2128.h" #endif typedef vquad Sleef_quadx2; #endif #ifdef ENABLE_AVX2 #define CONFIG 1 #include "helperavx2.h" #ifdef DORENAME #include "qrenameavx2.h" #endif typedef vquad Sleef_quadx4; #endif #ifdef ENABLE_AVX512F #define CONFIG 1 #include "helperavx512f.h" #ifdef DORENAME #include "qrenameavx512f.h" #endif typedef vquad Sleef_quadx8; #endif #ifdef ENABLE_ADVSIMD #define CONFIG 1 #include "helperadvsimd.h" #ifdef DORENAME #include "qrenameadvsimd.h" #endif typedef vquad Sleef_quadx2; #endif #ifdef ENABLE_SVE #define CONFIG 1 #include "helpersve.h" #ifdef DORENAME #include "qrenamesve.h" #endif typedef vquad Sleef_svquad; #endif 
#ifdef ENABLE_VSX
#define CONFIG 1
#include "helperpower_128.h"
#ifdef DORENAME
#include "qrenamevsx.h"
#endif
typedef vquad Sleef_quadx2;
#endif

#ifdef ENABLE_VSX3
#define CONFIG 3
#include "helperpower_128.h"
#ifdef DORENAME
#include "qrenamevsx3.h"
#endif
#endif

#ifdef ENABLE_VXE
#define CONFIG 140
#include "helpers390x_128.h"
#ifdef DORENAME
#include "qrenamevxe.h"
#endif
typedef vquad Sleef_quadx2;
#endif

#ifdef ENABLE_VXE2
#define CONFIG 150
#include "helpers390x_128.h"
#ifdef DORENAME
#include "qrenamevxe2.h"
#endif
typedef vquad Sleef_quadx2;
#endif

// RISC-V
#ifdef ENABLE_RVVM1
#define CONFIG 1
#define ENABLE_RVV_DP
#include "helperrvv.h"
#ifdef DORENAME
#include "qrenamervvm1.h"
#endif
typedef vquad Sleef_rvvm1quad;
#endif

#ifdef ENABLE_RVVM2
#define CONFIG 1
#define ENABLE_RVV_DP
#include "helperrvv.h"
#ifdef DORENAME
#include "qrenamervvm2.h"
#endif
typedef vquad Sleef_rvvm2quad;
#endif

#include "dd.h"
#include "commonfuncs.h"

//

// Classification helpers on the raw IEEE binary128 bit pattern.
// A vquad holds the low 64 bits in x and the high 64 bits (sign,
// 15-bit exponent, top of the mantissa) in y.

// True iff the 15-bit exponent field is all ones, i.e. a is Inf or NaN.
static INLINE CONST VECTOR_CC vopmask isnonfinite_vo_vq(vquad a) {
  return veq64_vo_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(a), vcast_vm_u64(UINT64_C(0x7fff000000000000))), vcast_vm_u64(UINT64_C(0x7fff000000000000)));
}

// Combined two-operand check. ma/mb are the complemented exponent fields
// (zero iff the operand is Inf/NaN), and the result tests ma & mb == 0.
// NOTE(review): this is true whenever either operand is nonfinite, but it
// can also fire for some finite pairs whose exponent bits are disjoint —
// presumably a deliberately conservative filter; confirm against callers.
static INLINE CONST VECTOR_CC vopmask isnonfinite_vo_vq_vq(vquad a, vquad b) {
  vmask ma = vxor_vm_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(a), vcast_vm_u64(UINT64_C(0x7fff000000000000))), vcast_vm_u64(UINT64_C(0x7fff000000000000)));
  vmask mb = vxor_vm_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(b), vcast_vm_u64(UINT64_C(0x7fff000000000000))), vcast_vm_u64(UINT64_C(0x7fff000000000000)));
  return veq64_vo_vm_vm(vand_vm_vm_vm(ma, mb), vcast_vm_u64(0));
}

// Three-operand variant of the conservative check above.
static INLINE CONST VECTOR_CC vopmask isnonfinite_vo_vq_vq_vq(vquad a, vquad b, vquad c) {
  vmask ma = vxor_vm_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(a), vcast_vm_u64(UINT64_C(0x7fff000000000000))), vcast_vm_u64(UINT64_C(0x7fff000000000000)));
  vmask mb = vxor_vm_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(b), vcast_vm_u64(UINT64_C(0x7fff000000000000))), vcast_vm_u64(UINT64_C(0x7fff000000000000)));
  vmask mc = vxor_vm_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(c), vcast_vm_u64(UINT64_C(0x7fff000000000000))), vcast_vm_u64(UINT64_C(0x7fff000000000000)));
  return veq64_vo_vm_vm(vand_vm_vm_vm(vand_vm_vm_vm(ma, mb), mc), vcast_vm_u64(0));
}

// True iff a is +Inf or -Inf: exponent all ones, mantissa (both the bits
// in y and the entire low word x) zero.
static INLINE CONST VECTOR_CC vopmask isinf_vo_vq(vquad a) {
  vopmask o = veq64_vo_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(a), vcast_vm_u64(UINT64_C(0x7fffffffffffffff))), vcast_vm_u64(UINT64_C(0x7fff000000000000)));
  return vand_vo_vo_vo(o, veq64_vo_vm_vm(vqgetx_vm_vq(a), vcast_vm_u64(0)));
}

// True iff a is exactly +Inf (sign bit clear).
static INLINE CONST VECTOR_CC vopmask ispinf_vo_vq(vquad a) {
  return vand_vo_vo_vo(veq64_vo_vm_vm(vqgety_vm_vq(a), vcast_vm_u64(UINT64_C(0x7fff000000000000))), veq64_vo_vm_vm(vqgetx_vm_vq(a), vcast_vm_u64(0)));
}

// True iff a is exactly -Inf (sign bit set).
static INLINE CONST VECTOR_CC vopmask isminf_vo_vq(vquad a) {
  return vand_vo_vo_vo(veq64_vo_vm_vm(vqgety_vm_vq(a), vcast_vm_u64(UINT64_C(0xffff000000000000))), veq64_vo_vm_vm(vqgetx_vm_vq(a), vcast_vm_u64(0)));
}

// NaN = nonfinite but not Inf.
static INLINE CONST VECTOR_CC vopmask isnan_vo_vq(vquad a) {
  return vandnot_vo_vo_vo(isinf_vo_vq(a), isnonfinite_vo_vq(a));
}

// True iff a is +0 or -0: all bits other than the sign bit are zero.
static INLINE CONST VECTOR_CC vopmask iszero_vo_vq(vquad a) {
  return veq64_vo_vm_vm(vor_vm_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(a), vcast_vm_u64(~UINT64_C(0x8000000000000000))), vqgetx_vm_vq(a)), vcast_vm_i64(0));
}

// XORs the sign bit of b into a (multiply a by the sign of b).
static INLINE CONST VECTOR_CC vquad mulsign_vq_vq_vq(vquad a, vquad b) {
  vmask m = vxor_vm_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(b), vcast_vm_u64(UINT64_C(0x8000000000000000))), vqgety_vm_vq(a));
  return vqsety_vq_vq_vm(a, m);
}

// Converts the binary128 bit pattern into a form on which 128-bit signed
// integer comparison gives the floating-point ordering: for negative
// inputs (sign bit set) the magnitude bits are complemented and
// incremented (with carry from the low word into the high word);
// non-negative inputs are passed through unchanged.
static INLINE CONST VECTOR_CC vquad cmpcnv_vq_vq(vquad x) {
  vquad t = vqsetxy_vq_vm_vm(vxor_vm_vm_vm(vqgetx_vm_vq(x), vcast_vm_u64(UINT64_C(0xffffffffffffffff))), vxor_vm_vm_vm(vqgety_vm_vq(x), vcast_vm_u64(UINT64_C(0x7fffffffffffffff))));
  t = vqsetx_vq_vq_vm(t, vadd64_vm_vm_vm(vqgetx_vm_vq(t), vcast_vm_i64(1)));
  // Propagate the carry of the +1 into the high word.
  t = vqsety_vq_vq_vm(t, vadd64_vm_vm_vm(vqgety_vm_vq(t), vand_vm_vo64_vm(veq64_vo_vm_vm(vqgetx_vm_vq(t), vcast_vm_i64(0)), vcast_vm_i64(1))));
  return sel_vq_vo_vq_vq(veq64_vo_vm_vm(vand_vm_vm_vm(vqgety_vm_vq(x), vcast_vm_u64(UINT64_C(0x8000000000000000))), vcast_vm_u64(UINT64_C(0x8000000000000000))), t, x);
}

// double3 functions ------------------------------------------------------------------------------------------------------------

// A vdouble3 is an unevaluated triple-double: value = x + y + z with
// decreasing magnitude.

static INLINE CONST VECTOR_CC vdouble3 cast_vd3_vd_vd_vd(vdouble d0, vdouble d1, vdouble d2) {
  return vd3setxyz_vd3_vd_vd_vd(d0, d1, d2);
}

static INLINE CONST VECTOR_CC vdouble3 cast_vd3_d_d_d(double d0, double d1, double d2) {
  return vd3setxyz_vd3_vd_vd_vd(vcast_vd_d(d0), vcast_vd_d(d1), vcast_vd_d(d2));
}

// Multiplies each limb by the sign of s.
static INLINE CONST VECTOR_CC vdouble3 mulsign_vd3_vd3_vd(vdouble3 d, vdouble s) {
  return cast_vd3_vd_vd_vd(vmulsign_vd_vd_vd(vd3getx_vd_vd3(d), s), vmulsign_vd_vd_vd(vd3gety_vd_vd3(d), s), vmulsign_vd_vd_vd(vd3getz_vd_vd3(d), s));
}

// |d|: flips all limbs by the sign of the leading limb.
static INLINE CONST VECTOR_CC vdouble3 abs_vd3_vd3(vdouble3 d) {
  return mulsign_vd3_vd3_vd(d, vd3getx_vd_vd3(d));
}

// Per-lane select: x where m is true, else y.
static INLINE CONST VECTOR_CC vdouble3 sel_vd3_vo_vd3_vd3(vopmask m, vdouble3 x, vdouble3 y) {
  return vd3setxyz_vd3_vd_vd_vd(vsel_vd_vo_vd_vd(m, vd3getx_vd_vd3(x), vd3getx_vd_vd3(y)), vsel_vd_vo_vd_vd(m, vd3gety_vd_vd3(x), vd3gety_vd_vd3(y)), vsel_vd_vo_vd_vd(m, vd3getz_vd_vd3(x), vd3getz_vd_vd3(y)));
}

// TD algorithms are based on Y. Hida et al., Library for double-double and quad-double arithmetic (2007).
// Error-free transformations (EFTs). Each returns a vdouble2 (hi, lo)
// such that hi + lo is the exact result of the operation and hi is the
// rounded result. Statement order is essential; these break if the
// compiler contracts multiplies and adds (hence -ffp-contract=off).

// Knuth's TwoSum: exact x + y for any ordering of magnitudes.
static INLINE CONST VECTOR_CC vdouble2 twosum_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble rx = vadd_vd_vd_vd(x, y);
  vdouble v = vsub_vd_vd_vd(rx, x);
  return vd2setxy_vd2_vd_vd(rx, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(rx, v)), vsub_vd_vd_vd(y, v)));
}

// TwoSum variant for exact x - y.
static INLINE CONST VECTOR_CC vdouble2 twosub_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble rx = vsub_vd_vd_vd(x, y);
  vdouble v = vsub_vd_vd_vd(rx, x);
  return vd2setxy_vd2_vd_vd(rx, vsub_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(rx, v)), vadd_vd_vd_vd(y, v)));
}

// TwoSum of x and y*s, using vmla/vmlapn for the scaled operand.
static INLINE CONST VECTOR_CC vdouble2 twosumx_vd2_vd_vd_vd(vdouble x, vdouble y, vdouble s) {
  vdouble rx = vmla_vd_vd_vd_vd(y, s, x);
  vdouble v = vsub_vd_vd_vd(rx, x);
  return vd2setxy_vd2_vd_vd(rx, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(rx, v)), vmlapn_vd_vd_vd_vd(y, s, v)));
}

// TwoSub of x and y*s.
static INLINE CONST VECTOR_CC vdouble2 twosubx_vd2_vd_vd_vd(vdouble x, vdouble y, vdouble s) {
  vdouble rx = vmlanp_vd_vd_vd_vd(y, s, x);
  vdouble v = vsub_vd_vd_vd(rx, x);
  return vd2setxy_vd2_vd_vd(rx, vsub_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(rx, v)), vmla_vd_vd_vd_vd(y, s, v)));
}

// Dekker's Fast2Sum: exact x + y assuming |x| >= |y| (or x == 0).
static INLINE CONST VECTOR_CC vdouble2 quicktwosum_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble rx = vadd_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(rx, vadd_vd_vd_vd(vsub_vd_vd_vd(x, rx), y));
}

// TwoProd: exact x * y. With FMA the error term is a single fused
// negated-product; otherwise Dekker's algorithm splits each operand at
// 27 bits with the (2^27 + 1) trick.
static INLINE CONST VECTOR_CC vdouble2 twoprod_vd2_vd_vd(vdouble x, vdouble y) {
#ifdef ENABLE_FMA_DP
  vdouble rx = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(rx, vfmapn_vd_vd_vd_vd(x, y, rx));
#else
  vdouble xh = vmul_vd_vd_vd(x, vcast_vd_d((1 << 27)+1));
  xh = vsub_vd_vd_vd(xh, vsub_vd_vd_vd(xh, x));
  vdouble xl = vsub_vd_vd_vd(x, xh);
  vdouble yh = vmul_vd_vd_vd(y, vcast_vd_d((1 << 27)+1));
  yh = vsub_vd_vd_vd(yh, vsub_vd_vd_vd(yh, y));
  vdouble yl = vsub_vd_vd_vd(y, yh);
  vdouble rx = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(rx, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(rx), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));
#endif
}

// Triple-double arithmetic. "2" suffixed variants renormalize their
// result; the unsuffixed variants skip (or only quickly renormalize)
// that step and are cheaper but leave the limbs less canonical.

// Multiplies every limb by s (exact when s is a power of two).
static INLINE CONST VECTOR_CC vdouble3 scale_vd3_vd3_vd(vdouble3 d, vdouble s) {
  return cast_vd3_vd_vd_vd(vmul_vd_vd_vd(vd3getx_vd_vd3(d), s), vmul_vd_vd_vd(vd3gety_vd_vd3(d), s), vmul_vd_vd_vd(vd3getz_vd_vd3(d), s));
}

static INLINE CONST VECTOR_CC vdouble3 scale_vd3_vd3_d(vdouble3 d, double s) {
  return scale_vd3_vd3_vd(d, vcast_vd_d(s));
}

// One Fast2Sum sweep over the limbs.
static INLINE CONST VECTOR_CC vdouble3 quickrenormalize_vd3_vd3(vdouble3 td) {
  vdouble2 u = quicktwosum_vd2_vd_vd(vd3getx_vd_vd3(td), vd3gety_vd_vd3(td));
  vdouble2 v = quicktwosum_vd2_vd_vd(vd2gety_vd_vd2(u), vd3getz_vd_vd3(td));
  return cast_vd3_vd_vd_vd(vd2getx_vd_vd2(u), vd2getx_vd_vd2(v), vd2gety_vd_vd2(v));
}

// Full renormalization: two Fast2Sum sweeps, producing canonical
// non-overlapping limbs.
static INLINE CONST VECTOR_CC vdouble3 normalize_vd3_vd3(vdouble3 td) {
  vdouble2 u = quicktwosum_vd2_vd_vd(vd3getx_vd_vd3(td), vd3gety_vd_vd3(td));
  vdouble2 v = quicktwosum_vd2_vd_vd(vd2gety_vd_vd2(u), vd3getz_vd_vd3(td));
  td = vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(u), vd2getx_vd_vd2(v), vd2gety_vd_vd2(v));
  u = quicktwosum_vd2_vd_vd(vd3getx_vd_vd3(td), vd3gety_vd_vd3(td));
  return vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(u), vd2gety_vd_vd2(u), vd3getz_vd_vd3(td));
}

// x + y, renormalized.
static INLINE CONST VECTOR_CC vdouble3 add2_vd3_vd3_vd3(vdouble3 x, vdouble3 y) {
  vdouble2 d0 = twosum_vd2_vd_vd(vd3getx_vd_vd3(x), vd3getx_vd_vd3(y));
  vdouble2 d1 = twosum_vd2_vd_vd(vd3gety_vd_vd3(x), vd3gety_vd_vd3(y));
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  return normalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_4vd(vd3getz_vd_vd3(x), vd3getz_vd_vd3(y), vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d3))));
}

// x - y, renormalized.
static INLINE CONST VECTOR_CC vdouble3 sub2_vd3_vd3_vd3(vdouble3 x, vdouble3 y) {
  vdouble2 d0 = twosub_vd2_vd_vd(vd3getx_vd_vd3(x), vd3getx_vd_vd3(y));
  vdouble2 d1 = twosub_vd2_vd_vd(vd3gety_vd_vd3(x), vd3gety_vd_vd3(y));
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  return normalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_4vd(vd3getz_vd_vd3(x), vneg_vd_vd(vd3getz_vd_vd3(y)), vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d3))));
}

// double-double + triple-double, renormalized.
static INLINE CONST VECTOR_CC vdouble3 add2_vd3_vd2_vd3(vdouble2 x, vdouble3 y) {
  vdouble2 d0 = twosum_vd2_vd_vd(vd2getx_vd_vd2(x), vd3getx_vd_vd3(y));
  vdouble2 d1 = twosum_vd2_vd_vd(vd2gety_vd_vd2(x), vd3gety_vd_vd3(y));
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  return normalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_3vd(vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d3), vd3getz_vd_vd3(y))));
}

// double-double + triple-double, without renormalization.
static INLINE CONST VECTOR_CC vdouble3 add_vd3_vd2_vd3(vdouble2 x, vdouble3 y) {
  vdouble2 d0 = twosum_vd2_vd_vd(vd2getx_vd_vd2(x), vd3getx_vd_vd3(y));
  vdouble2 d1 = twosum_vd2_vd_vd(vd2gety_vd_vd2(x), vd3gety_vd_vd3(y));
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  return vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_3vd(vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d3), vd3getz_vd_vd3(y)));
}

// double + triple-double, renormalized.
static INLINE CONST VECTOR_CC vdouble3 add2_vd3_vd_vd3(vdouble x, vdouble3 y) {
  vdouble2 d0 = twosum_vd2_vd_vd(x, vd3getx_vd_vd3(y));
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd3gety_vd_vd3(y));
  return normalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_vd_vd(vd2gety_vd_vd2(d3), vd3getz_vd_vd3(y))));
}

// double + triple-double, without renormalization.
static INLINE CONST VECTOR_CC vdouble3 add_vd3_vd_vd3(vdouble x, vdouble3 y) {
  vdouble2 d0 = twosum_vd2_vd_vd(x, vd3getx_vd_vd3(y));
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd3gety_vd_vd3(y));
  return vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_vd_vd(vd2gety_vd_vd2(d3), vd3getz_vd_vd3(y)));
}

// x + y*s, renormalized.
static INLINE CONST VECTOR_CC vdouble3 scaleadd2_vd3_vd3_vd3_vd(vdouble3 x, vdouble3 y, vdouble s) {
  vdouble2 d0 = twosumx_vd2_vd_vd_vd(vd3getx_vd_vd3(x), vd3getx_vd_vd3(y), s);
  vdouble2 d1 = twosumx_vd2_vd_vd_vd(vd3gety_vd_vd3(x), vd3gety_vd_vd3(y), s);
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  return normalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_3vd(vmla_vd_vd_vd_vd(vd3getz_vd_vd3(y), s, vd3getz_vd_vd3(x)), vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d3))));
}

// x - y*s, renormalized.
static INLINE CONST VECTOR_CC vdouble3 scalesub2_vd3_vd3_vd3_vd(vdouble3 x, vdouble3 y, vdouble s) {
  vdouble2 d0 = twosubx_vd2_vd_vd_vd(vd3getx_vd_vd3(x), vd3getx_vd_vd3(y), s);
  vdouble2 d1 = twosubx_vd2_vd_vd_vd(vd3gety_vd_vd3(x), vd3gety_vd_vd3(y), s);
  vdouble2 d3 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  return normalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d3), vadd_vd_3vd(vmlanp_vd_vd_vd_vd(vd3getz_vd_vd3(y), s, vd3getz_vd_vd3(x)), vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d3))));
}

// x * y, renormalized. Partial products of the three leading limb pairs
// are computed exactly (TwoProd); lower-order cross terms are folded into
// the third limb with plain FMAs.
static INLINE CONST VECTOR_CC vdouble3 mul2_vd3_vd3_vd3(vdouble3 x, vdouble3 y) {
  vdouble2 d0 = twoprod_vd2_vd_vd(vd3getx_vd_vd3(x), vd3getx_vd_vd3(y));
  vdouble2 d1 = twoprod_vd2_vd_vd(vd3getx_vd_vd3(x), vd3gety_vd_vd3(y));
  vdouble2 d2 = twoprod_vd2_vd_vd(vd3gety_vd_vd3(x), vd3getx_vd_vd3(y));
  vdouble2 d4 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  vdouble2 d5 = twosum_vd2_vd_vd(vd2getx_vd_vd2(d4), vd2getx_vd_vd2(d2));
  vdouble t2 = vadd_vd_3vd(vmla_vd_vd_vd_vd(vd3getz_vd_vd3(x), vd3getx_vd_vd3(y), vmla_vd_vd_vd_vd(vd3gety_vd_vd3(x), vd3gety_vd_vd3(y), vmla_vd_vd_vd_vd(vd3getx_vd_vd3(x), vd3getz_vd_vd3(y), vadd_vd_vd_vd(vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d2))))), vd2gety_vd_vd2(d4), vd2gety_vd_vd2(d5));
  return normalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d5), t2));
}

// x * y, with only a quick renormalization.
static INLINE CONST VECTOR_CC vdouble3 mul_vd3_vd3_vd3(vdouble3 x, vdouble3 y) {
  vdouble2 d0 = twoprod_vd2_vd_vd(vd3getx_vd_vd3(x), vd3getx_vd_vd3(y));
  vdouble2 d1 = twoprod_vd2_vd_vd(vd3getx_vd_vd3(x), vd3gety_vd_vd3(y));
  vdouble2 d2 = twoprod_vd2_vd_vd(vd3gety_vd_vd3(x), vd3getx_vd_vd3(y));
  vdouble2 d4 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  vdouble2 d5 = twosum_vd2_vd_vd(vd2getx_vd_vd2(d4), vd2getx_vd_vd2(d2));
  vdouble t2 = vadd_vd_3vd(vmla_vd_vd_vd_vd(vd3getz_vd_vd3(x), vd3getx_vd_vd3(y), vmla_vd_vd_vd_vd(vd3gety_vd_vd3(x), vd3gety_vd_vd3(y), vmla_vd_vd_vd_vd(vd3getx_vd_vd3(x), vd3getz_vd_vd3(y), vadd_vd_vd_vd(vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d2))))), vd2gety_vd_vd2(d4), vd2gety_vd_vd2(d5));
  return quickrenormalize_vd3_vd3(vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d5), t2));
}

// x * x.
static INLINE CONST VECTOR_CC vdouble3 squ_vd3_vd3(vdouble3 x) { return mul_vd3_vd3_vd3(x, x); }

// double-double * triple-double.
static INLINE CONST VECTOR_CC vdouble3 mul_vd3_vd2_vd3(vdouble2 x, vdouble3 y) {
  vdouble2 d0 = twoprod_vd2_vd_vd(vd2getx_vd_vd2(x), vd3getx_vd_vd3(y));
  vdouble2 d1 = twoprod_vd2_vd_vd(vd2getx_vd_vd2(x), vd3gety_vd_vd3(y));
  vdouble2 d2 = twoprod_vd2_vd_vd(vd2gety_vd_vd2(x), vd3getx_vd_vd3(y));
  vdouble2 d4 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  vdouble2 d5 = twosum_vd2_vd_vd(vd2getx_vd_vd2(d4), vd2getx_vd_vd2(d2));
  vdouble t2 = vadd_vd_3vd(vmla_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd3gety_vd_vd3(y), vmla_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd3getz_vd_vd3(y), vadd_vd_vd_vd(vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d2)))), vd2gety_vd_vd2(d4), vd2gety_vd_vd2(d5));
  return vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d5), t2);
}

// triple-double * double-double (same computation, arguments swapped).
static INLINE CONST VECTOR_CC vdouble3 mul_vd3_vd3_vd2(vdouble3 x, vdouble2 y) {
  vdouble2 d0 = twoprod_vd2_vd_vd(vd2getx_vd_vd2(y), vd3getx_vd_vd3(x));
  vdouble2 d1 = twoprod_vd2_vd_vd(vd2getx_vd_vd2(y), vd3gety_vd_vd3(x));
  vdouble2 d2 = twoprod_vd2_vd_vd(vd2gety_vd_vd2(y), vd3getx_vd_vd3(x));
  vdouble2 d4 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  vdouble2 d5 = twosum_vd2_vd_vd(vd2getx_vd_vd2(d4), vd2getx_vd_vd2(d2));
  vdouble t2 = vadd_vd_3vd(vmla_vd_vd_vd_vd(vd2gety_vd_vd2(y), vd3gety_vd_vd3(x), vmla_vd_vd_vd_vd(vd2getx_vd_vd2(y), vd3getz_vd_vd3(x), vadd_vd_vd_vd(vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d2)))), vd2gety_vd_vd2(d4), vd2gety_vd_vd2(d5));
  return vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d5), t2);
}

// triple-double * double.
static INLINE CONST VECTOR_CC vdouble3 mul_vd3_vd3_vd(vdouble3 x, vdouble y) {
  vdouble2 d0 = twoprod_vd2_vd_vd(y, vd3getx_vd_vd3(x));
  vdouble2 d1 = twoprod_vd2_vd_vd(y, vd3gety_vd_vd3(x));
  vdouble2 d4 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  return vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d4), vadd_vd_vd_vd(vmla_vd_vd_vd_vd(y, vd3getz_vd_vd3(x), vd2gety_vd_vd2(d1)), vd2gety_vd_vd2(d4)));
}

// double-double * double-double, producing a triple-double.
static INLINE CONST VECTOR_CC vdouble3 mul_vd3_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble2 d0 = twoprod_vd2_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble2 d1 = twoprod_vd2_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y));
  vdouble2 d2 = twoprod_vd2_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble2 d4 = twosum_vd2_vd_vd(vd2gety_vd_vd2(d0), vd2getx_vd_vd2(d1));
  vdouble2 d5 = twosum_vd2_vd_vd(vd2getx_vd_vd2(d4), vd2getx_vd_vd2(d2));
  vdouble t2 = vadd_vd_3vd(vmla_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y), vadd_vd_vd_vd(vd2gety_vd_vd2(d1), vd2gety_vd_vd2(d2))), vd2gety_vd_vd2(d4), vd2gety_vd_vd2(d5));
  return vd3setxyz_vd3_vd_vd_vd(vd2getx_vd_vd2(d0), vd2getx_vd_vd2(d5), t2);
}

// Division and reciprocal: start from a double-double reciprocal d of q
// and apply one Newton-like refinement n * (d + (-d) * (d*q - 1)).

// n / q, renormalized.
static INLINE CONST VECTOR_CC vdouble3 div2_vd3_vd3_vd3(vdouble3 n, vdouble3 q) {
  vdouble2 d = ddrec_vd2_vd2(vcast_vd2_vd_vd(vd3getx_vd_vd3(q), vd3gety_vd_vd3(q)));
  return mul2_vd3_vd3_vd3(n, add_vd3_vd2_vd3(d, mul_vd3_vd2_vd3(ddscale_vd2_vd2_d(d, -1), add_vd3_vd_vd3(vcast_vd_d(-1), mul_vd3_vd2_vd3(d, q)))));
}

// n / q, without the final full renormalization.
static INLINE CONST VECTOR_CC vdouble3 div_vd3_vd3_vd3(vdouble3 n, vdouble3 q) {
  vdouble2 d = ddrec_vd2_vd2(vcast_vd2_vd_vd(vd3getx_vd_vd3(q), vd3gety_vd_vd3(q)));
  return mul_vd3_vd3_vd3(n, add_vd3_vd2_vd3(d, mul_vd3_vd2_vd3(ddscale_vd2_vd2_d(d, -1), add_vd3_vd_vd3(vcast_vd_d(-1), mul_vd3_vd2_vd3(d, q)))));
}

// 1 / q for a triple-double q.
static INLINE CONST VECTOR_CC vdouble3 rec_vd3_vd3(vdouble3 q) {
  vdouble2 d = ddrec_vd2_vd2(vcast_vd2_vd_vd(vd3getx_vd_vd3(q), vd3gety_vd_vd3(q)));
  return add2_vd3_vd2_vd3(d, mul_vd3_vd2_vd3(ddscale_vd2_vd2_d(d, -1), add_vd3_vd_vd3(vcast_vd_d(-1), mul_vd3_vd2_vd3(d, q))));
}

// 1 / q for a double-double q, returned as a triple-double.
static INLINE CONST VECTOR_CC vdouble3 rec_vd3_vd2(vdouble2 q) {
  vdouble2 d = ddrec_vd2_vd2(vcast_vd2_vd_vd(vd2getx_vd_vd2(q), vd2gety_vd_vd2(q)));
  return add2_vd3_vd2_vd3(d, mul_vd3_vd2_vd3(ddscale_vd2_vd2_d(d, -1), add_vd3_vd_vd3(vcast_vd_d(-1), mul_vd3_vd2_vd2(d, q))));
}

// sqrt(d): double-double estimate t refined via (d + t*t) / t * 0.5,
// with the d == 0 lanes forced to zero.
static INLINE CONST VECTOR_CC vdouble3 sqrt_vd3_vd3(vdouble3 d) {
  vdouble2 t = ddsqrt_vd2_vd2(vcast_vd2_vd_vd(vd3getx_vd_vd3(d), vd3gety_vd_vd3(d)));
  vdouble3 r = mul2_vd3_vd3_vd3(add2_vd3_vd3_vd3(d, mul_vd3_vd2_vd2(t, t)), rec_vd3_vd2(t));
  r = sel_vd3_vo_vd3_vd3(veq_vo_vd_vd(vd3getx_vd_vd3(d), vcast_vd_d(0)), cast_vd3_d_d_d(0, 0, 0), scale_vd3_vd3_d(r, 0.5));
  return r;
}

// -d (negates every limb).
static INLINE CONST VECTOR_CC vdouble3 neg_vd3_vd3(vdouble3 d) {
  return vd3setxyz_vd3_vd_vd_vd(vneg_vd_vd(vd3getx_vd_vd3(d)), vneg_vd_vd(vd3gety_vd_vd3(d)), vneg_vd_vd(vd3getz_vd_vd3(d)));
}

//

// polyNd: degree-(N-1) polynomial evaluation in plain vdouble using an
// Estrin-style scheme (combining powers x, x^2 and x^4 of the argument).
static INLINE CONST VECTOR_CC vdouble poly2d(vdouble x, double c1, double c0) { return vmla_vd_vd_vd_vd(x, vcast_vd_d(c1), vcast_vd_d(c0)); }
static INLINE CONST VECTOR_CC vdouble poly3d(vdouble x, double c2, double c1, double c0) { return vmla_vd_vd_vd_vd(vmul_vd_vd_vd(x, x), vcast_vd_d(c2), poly2d(x, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble poly4d(vdouble x, double c3, double c2, double c1, double c0) { return vmla_vd_vd_vd_vd(vmul_vd_vd_vd(x, x), poly2d(x, c3, c2), poly2d(x, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble poly5d(vdouble x, double c4, double c3, double c2, double c1, double c0) { return vmla_vd_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, x),vmul_vd_vd_vd(x, x)), vcast_vd_d(c4), poly4d(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble poly6d(vdouble x, double c5, double c4, double c3, double c2, double c1, double c0) { return vmla_vd_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, x),vmul_vd_vd_vd(x, x)), poly2d(x, c5, c4), poly4d(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble poly7d(vdouble x, double c6, double c5, double c4, double c3, double c2, double c1, double c0) { return vmla_vd_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, x),vmul_vd_vd_vd(x, x)), poly3d(x, c6, c5, c4), poly4d(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble poly8d(vdouble x, double c7, double c6, double c5, double c4, double c3, double c2, double c1, double c0) { return vmla_vd_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, x),vmul_vd_vd_vd(x, x)), poly4d(x, c7, c6, c5, c4), poly4d(x, c3, c2, c1, c0)); }

//

// polyNdd / polyNdd_b: the same Estrin scheme in double-double precision.
// The _b variants take every coefficient as a double2 constant; the
// non-_b variants take the leading coefficient as a runtime vdouble
// (padded with a zero low word).
static INLINE CONST VECTOR_CC vdouble2 poly2dd_b(vdouble2 x, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(x, vcast_vd2_d2(c1), vcast_vd2_d2(c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly2dd(vdouble2 x, vdouble c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(x, vcast_vd2_vd_vd(c1, vcast_vd_d(0)), vcast_vd2_d2(c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly3dd_b(vdouble2 x, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(x), vcast_vd2_d2(c2), poly2dd_b(x, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly3dd(vdouble2 x, vdouble c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(x), vcast_vd2_vd_vd(c2, vcast_vd_d(0)), poly2dd_b(x, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly4dd_b(vdouble2 x, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(x), poly2dd_b(x, c3, c2), poly2dd_b(x, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly4dd(vdouble2 x, vdouble c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(x), poly2dd(x, c3, c2), poly2dd_b(x, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly5dd_b(vdouble2 x, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), vcast_vd2_d2(c4), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly5dd(vdouble2 x, vdouble c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), vcast_vd2_vd_vd(c4, vcast_vd_d(0)), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly6dd_b(vdouble2 x, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), poly2dd_b(x, c5, c4), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly6dd(vdouble2 x, vdouble c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), poly2dd(x, c5, c4), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly7dd_b(vdouble2 x, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), poly3dd_b(x, c6, c5, c4), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly7dd(vdouble2 x, vdouble c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), poly3dd(x, c6, c5, c4), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly8dd_b(vdouble2 x, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), poly4dd_b(x, c7, c6, c5, c4), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly8dd(vdouble2 x, vdouble c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)), poly4dd(x, c7, c6, c5, c4), poly4dd_b(x, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly9dd_b(vdouble2 x, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), vcast_vd2_d2(c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly9dd(vdouble2 x, vdouble c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), vcast_vd2_vd_vd(c8, vcast_vd_d(0)), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly10dd_b(vdouble2 x, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly2dd_b(x, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly10dd(vdouble2 x, vdouble c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly2dd(x, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly11dd_b(vdouble2 x, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly3dd_b(x, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly11dd(vdouble2 x, vdouble c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly3dd(x, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }
static INLINE CONST VECTOR_CC vdouble2 poly12dd_b(vdouble2 x, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly4dd_b(x, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }
// Degree-11..14 double-double polynomial evaluators (same Estrin split as the
// lower-degree ones: x^8 * high part + poly8 of the low 8 coefficients).
static INLINE CONST VECTOR_CC vdouble2 poly12dd(vdouble2 x, vdouble c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly4dd(x, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly13dd_b(vdouble2 x, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly5dd_b(x, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly13dd(vdouble2 x, vdouble c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly5dd(x, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly14dd_b(vdouble2 x, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly6dd_b(x, c13, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly14dd(vdouble2 x, vdouble c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly6dd(x, c13, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly15dd_b: signature continues on the next line.
static INLINE CONST VECTOR_CC vdouble2 poly15dd_b(vdouble2 x,
// (continuation of poly15dd_b: x^8 * poly7(c14..c8) + poly8(c7..c0))
double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly7dd_b(x, c14, c13, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

// Degree-14/15 evaluators; poly16dd* is the largest using the x^8 split.
static INLINE CONST VECTOR_CC vdouble2 poly15dd(vdouble2 x, vdouble c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly7dd(x, c14, c13, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly16dd_b(vdouble2 x, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly8dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly16dd(vdouble2 x, vdouble c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x))), poly8dd(x, c15, c14, c13, c12, c11, c10, c9, c8), poly8dd_b(x, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly17dd_b and the following higher-degree evaluators switch to an x^16
// split (four squarings): pN(x) = x^16 * poly(N-16) + poly16.
// Body continues on the next line.
static INLINE CONST VECTOR_CC vdouble2 poly17dd_b(vdouble2 x, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return
// (continuation of poly17dd_b: x^16 * c16 + poly16(c15..c0))
ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), vcast_vd2_d2(c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// Degree-16/17 evaluators with the x^16 (four-squarings) split.
static INLINE CONST VECTOR_CC vdouble2 poly17dd(vdouble2 x, vdouble c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), vcast_vd2_vd_vd(c16, vcast_vd_d(0)), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly18dd_b(vdouble2 x, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly2dd_b(x, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly18dd(vdouble2 x, vdouble c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly2dd(x, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly19dd_b: body continues on the next line.
static INLINE CONST VECTOR_CC vdouble2 poly19dd_b(vdouble2 x, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return
// (continuation of poly19dd_b: x^16 * poly3(c18..c16) + poly16(c15..c0))
ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly3dd_b(x, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// Degree-18/19 evaluators with the x^16 split.
static INLINE CONST VECTOR_CC vdouble2 poly19dd(vdouble2 x, vdouble c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly3dd(x, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly20dd_b(vdouble2 x, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly4dd_b(x, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly20dd(vdouble2 x, vdouble c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly4dd(x, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly21dd_b: parameter list continues on the next line.
static INLINE CONST VECTOR_CC vdouble2 poly21dd_b(vdouble2 x, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2
// (continuation of poly21dd_b: x^16 * poly5(c20..c16) + poly16(c15..c0))
c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly5dd_b(x, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// Degree-20/21 evaluators with the x^16 split.
static INLINE CONST VECTOR_CC vdouble2 poly21dd(vdouble2 x, vdouble c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly5dd(x, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly22dd_b(vdouble2 x, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly6dd_b(x, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly22dd(vdouble2 x, vdouble c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly6dd(x, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly23dd_b: declaration continues on the next line.
static INLINE CONST
// (continuation of poly23dd_b: x^16 * poly7(c22..c16) + poly16(c15..c0))
VECTOR_CC vdouble2 poly23dd_b(vdouble2 x, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly7dd_b(x, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// Degree-22/23 evaluators with the x^16 split.
static INLINE CONST VECTOR_CC vdouble2 poly23dd(vdouble2 x, vdouble c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly7dd(x, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly24dd_b(vdouble2 x, double2 c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly8dd_b(x, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly24dd: parameter list continues on the next line.
static INLINE CONST VECTOR_CC vdouble2 poly24dd(vdouble2 x, vdouble c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9,
// (continuation of poly24dd: x^16 * poly8(c23..c16) + poly16(c15..c0))
double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly8dd(x, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// Degree-24 evaluators with the x^16 split.
static INLINE CONST VECTOR_CC vdouble2 poly25dd_b(vdouble2 x, double2 c24, double2 c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly9dd_b(x, c24, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly25dd(vdouble2 x, vdouble c24, double2 c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly9dd(x, c24, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly26dd_b: body continues on the next line.
static INLINE CONST VECTOR_CC vdouble2 poly26dd_b(vdouble2 x, double2 c25, double2 c24, double2 c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return
// (continuation of poly26dd_b: x^16 * poly10(c25..c16) + poly16(c15..c0))
ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly10dd_b(x, c25, c24, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// Degree-25/26 evaluators with the x^16 split.
static INLINE CONST VECTOR_CC vdouble2 poly26dd(vdouble2 x, vdouble c25, double2 c24, double2 c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly10dd(x, c25, c24, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble2 poly27dd_b(vdouble2 x, double2 c26, double2 c25, double2 c24, double2 c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly11dd_b(x, c26, c25, c24, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

// poly27dd: body continues on the next line.
static INLINE CONST VECTOR_CC vdouble2 poly27dd(vdouble2 x, vdouble c26, double2 c25, double2 c24, double2 c23, double2 c22, double2 c21, double2 c20, double2 c19, double2 c18, double2 c17, double2 c16, double2 c15, double2 c14, double2 c13, double2 c12, double2 c11, double2 c10, double2 c9, double2 c8, double2 c7, double2 c6, double2 c5, double2 c4, double2 c3, double2 c2, double2 c1, double2 c0) { return
// (continuation of poly27dd: x^16 * poly11(c26..c16) + poly16(c15..c0))
ddmla_vd2_vd2_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(ddsqu_vd2_vd2(x)))), poly11dd(x, c26, c25, c24, c23, c22, c21, c20, c19, c18, c17, c16), poly16dd_b(x, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)); }

//

// Scalar triple-double coefficient type (the CUDA build is expected to
// provide its own double3 -- hence the guard).
#ifndef SLEEF_ENABLE_CUDA
typedef struct {
  double x, y, z;
} double3;
#endif

// Broadcast a scalar triple-double coefficient into a vector triple-double.
static INLINE CONST VECTOR_CC vdouble3 cast_vd3_d3(double3 td) { return vd3setxyz_vd3_vd_vd_vd(vcast_vd_d(td.x), vcast_vd_d(td.y), vcast_vd_d(td.z)); }

// Construct a double3 from its three components.
static INLINE CONST double3 td(double x, double y, double z) { double3 ret = { x, y, z }; return ret; }

// Fused multiply-add on triple-doubles: x * y + z.
static INLINE CONST VECTOR_CC vdouble3 mla_vd3_vd3_vd3_vd3(vdouble3 x, vdouble3 y, vdouble3 z) { return add2_vd3_vd3_vd3(z, mul_vd3_vd3_vd3(x, y)); }

// Triple-double polynomial evaluators: polyNtd*(x, c(N-1)..c0) = sum(ci * x^i).
// "_b" variants take scalar double3 coefficients; the plain variants take the
// leading coefficient as a vdouble2. poly2td_b special-cases coefficients
// whose lower words are zero (and c1 == 1) with cheaper add/mul forms; the
// branches compare scalar coefficient values, so with literal coefficients
// they fold away at compile time after inlining.
static INLINE CONST VECTOR_CC vdouble3 poly2td_b(vdouble3 x, double3 c1, double3 c0) { if (c0.y == 0 && c0.z == 0) { if (c1.x == 1 && c1.y == 0 && c1.z == 0) return add2_vd3_vd_vd3(vcast_vd_d(c0.x), x); return add2_vd3_vd_vd3(vcast_vd_d(c0.x), mul_vd3_vd3_vd3(x, cast_vd3_d3(c1))); } return mla_vd3_vd3_vd3_vd3(x, cast_vd3_d3(c1), cast_vd3_d3(c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly2td(vdouble3 x, vdouble2 c1, double3 c0) { return add2_vd3_vd3_vd3(cast_vd3_d3(c0), mul_vd3_vd3_vd2(x, c1)); }

static INLINE CONST VECTOR_CC vdouble3 poly3td_b(vdouble3 x, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(x), cast_vd3_d3(c2), poly2td_b(x, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly3td(vdouble3 x, vdouble2 c2, double3 c1, double3 c0) { return add2_vd3_vd3_vd3(poly2td_b(x, c1, c0), mul_vd3_vd3_vd2(squ_vd3_vd3(x), c2)); }

static INLINE CONST VECTOR_CC vdouble3 poly4td_b(vdouble3 x, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(x), poly2td_b(x, c3, c2), poly2td_b(x, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly4td(vdouble3 x, vdouble2 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(x), poly2td(x, c3, c2), poly2td_b(x, c1, c0)); }
// Degree-4..7 triple-double polynomial evaluators, Estrin split on x^2 and x^4.
static INLINE CONST VECTOR_CC vdouble3 poly5td_b(vdouble3 x, double3 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(squ_vd3_vd3(x)), cast_vd3_d3(c4), poly4td_b(x, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly5td(vdouble3 x, vdouble2 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return add2_vd3_vd3_vd3(poly4td_b(x, c3, c2, c1, c0), mul_vd3_vd3_vd2(squ_vd3_vd3(squ_vd3_vd3(x)), c4)); }

static INLINE CONST VECTOR_CC vdouble3 poly6td_b(vdouble3 x, double3 c5, double3 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(squ_vd3_vd3(x)), poly2td_b(x, c5, c4), poly4td_b(x, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly6td(vdouble3 x, vdouble2 c5, double3 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(squ_vd3_vd3(x)), poly2td(x, c5, c4), poly4td_b(x, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly7td_b(vdouble3 x, double3 c6, double3 c5, double3 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(squ_vd3_vd3(x)), poly3td_b(x, c6, c5, c4), poly4td_b(x, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly7td(vdouble3 x, vdouble2 c6, double3 c5, double3 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(squ_vd3_vd3(x)), poly3td(x, c6, c5, c4), poly4td_b(x, c3, c2, c1, c0)); }

static INLINE CONST VECTOR_CC vdouble3 poly8td_b(vdouble3 x, double3 c7, double3 c6, double3 c5, double3 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(squ_vd3_vd3(x)), poly4td_b(x, c7, c6, c5, c4), poly4td_b(x, c3, c2, c1, c0)); }

// poly8td: call continues on the next line.
static INLINE CONST VECTOR_CC vdouble3 poly8td(vdouble3 x, vdouble2 c7, double3 c6, double3 c5, double3 c4, double3 c3, double3 c2, double3 c1, double3 c0) { return mla_vd3_vd3_vd3_vd3(squ_vd3_vd3(squ_vd3_vd3(x)), poly4td(x, c7, c6, c5, c4),
// (continuation of poly8td: x^4 * poly4(c7..c4) + poly4(c3..c0))
poly4td_b(x, c3, c2, c1, c0)); }

// TDX functions ------------------------------------------------------------------------------------------------------------

// A tdx value holds an explicit 64-bit exponent (bias 16383) plus a
// triple-double mantissa whose leading word is kept near unit magnitude.

// TDX Cast operators

// Convert a vdouble to tdx: extract the exponent with vilogbk, scale the
// mantissa back by 2^-e (skipped for non-finite inputs, which are passed
// through), then store the biased exponent.
static INLINE CONST VECTOR_CC tdx cast_tdx_vd(vdouble d) {
  tdx r = tdxsetexyz_tdx_vm_vd_vd_vd(vilogbk_vm_vd(d), d, vcast_vd_d(0), vcast_vd_d(0));
  r = tdxsetx_tdx_tdx_vd(r, vsel_vd_vo_vd_vd(visnonfinite_vo_vd(tdxgetd3x_vd_tdx(r)), tdxgetd3x_vd_tdx(r), vldexp2_vd_vd_vm(tdxgetd3x_vd_tdx(r), vneg64_vm_vm(tdxgete_vm_tdx(r)))));
  r = tdxsete_tdx_tdx_vm(r, vadd64_vm_vm_vm(tdxgete_vm_tdx(r), vcast_vm_i64(16383)));
  return r;
}

// Convert a scalar double to tdx.
static INLINE CONST VECTOR_CC tdx cast_tdx_d(double d) { return cast_tdx_vd(vcast_vd_d(d)); }

// Convert a triple-double to tdx: normalize all three words by 2^-e of the
// leading word, then bias the exponent.
static INLINE CONST VECTOR_CC tdx cast_tdx_vd3(vdouble3 d) {
  vmask re = vilogbk_vm_vd(vd3getx_vd_vd3(d));
  vdouble3 rd3 = vd3setxyz_vd3_vd_vd_vd(vldexp2_vd_vd_vm(vd3getx_vd_vd3(d), vneg64_vm_vm(re)), vldexp2_vd_vd_vm(vd3gety_vd_vd3(d), vneg64_vm_vm(re)), vldexp2_vd_vd_vm(vd3getz_vd_vd3(d), vneg64_vm_vm(re)));
  re = vadd64_vm_vm_vm(re, vcast_vm_i64(16383));
  return tdxseted3_tdx_vm_vd3(re, rd3);
}

// Convert three scalar components (a triple-double constant) to tdx.
static INLINE CONST VECTOR_CC tdx cast_tdx_d_d_d(double d0, double d1, double d2) { return cast_tdx_vd3(cast_vd3_d_d_d(d0, d1, d2)); }

// Faster tdx conversion using vilogb2k and a single multiply by 2^-e
// (t is built from 0.5, compensating the off-by-one of vilogb2k);
// does not handle the special cases cast_tdx_vd3 covers.
static INLINE CONST VECTOR_CC tdx fastcast_tdx_vd3(vdouble3 d) {
  vmask re = vadd64_vm_vm_vm(vilogb2k_vm_vd(vd3getx_vd_vd3(d)), vcast_vm_i64(16383));
  vdouble t = vldexp3_vd_vd_vm(vcast_vd_d(0.5), vneg64_vm_vm(re));
  return tdxsetexyz_tdx_vm_vd_vd_vd(re, vmul_vd_vd_vd(vd3getx_vd_vd3(d), t), vmul_vd_vd_vd(vd3gety_vd_vd3(d), t), vmul_vd_vd_vd(vd3getz_vd_vd3(d), t));
}

// Convert tdx back to a vdouble: rescale the leading mantissa word by the
// unbiased exponent, flushing to signed zero on underflow (exponent below
// 16383 - 0x500) and to signed infinity on overflow; continues on next line.
static INLINE CONST VECTOR_CC vdouble cast_vd_tdx(tdx t) {
  vdouble ret = vldexp2_vd_vd_vm(tdxgetd3x_vd_tdx(t), vadd64_vm_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(-16383)));
  vopmask o = vor_vo_vo_vo(veq_vo_vd_vd(tdxgetd3x_vd_tdx(t), vcast_vd_d(0)), vlt64_vo_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(16383 - 0x500)));
  ret = vsel_vd_vo_vd_vd(o, vmulsign_vd_vd_vd(vcast_vd_d(0), tdxgetd3x_vd_tdx(t)), ret);
  o =
// (continuation of cast_vd_tdx: overflow to signed infinity; non-finite
// leading words are passed through unchanged)
vgt64_vo_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(16383 + 0x400));
  ret = vsel_vd_vo_vd_vd(o, vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), tdxgetd3x_vd_tdx(t)), ret);
  return vsel_vd_vo_vd_vd(visnonfinite_vo_vd(tdxgetd3x_vd_tdx(t)), tdxgetd3x_vd_tdx(t), ret);
}

// Convert tdx to a triple-double: apply the unbiased exponent to all three
// mantissa words, with the same underflow-to-signed-zero / overflow-to-signed-
// infinity / non-finite-passthrough handling as cast_vd_tdx.
static INLINE CONST VECTOR_CC vdouble3 cast_vd3_tdx(tdx t) {
  vmask e = vadd64_vm_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(-16383));
  vdouble3 r = cast_vd3_vd_vd_vd(vldexp2_vd_vd_vm(tdxgetd3x_vd_tdx(t), e), vldexp2_vd_vd_vm(tdxgetd3y_vd_tdx(t), e), vldexp2_vd_vd_vm(tdxgetd3z_vd_tdx(t), e));
  vopmask o = vor_vo_vo_vo(veq_vo_vd_vd(tdxgetd3x_vd_tdx(t), vcast_vd_d(0)), vlt64_vo_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(16383 - 0x500)));
  r = sel_vd3_vo_vd3_vd3(o, cast_vd3_vd_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(0), tdxgetd3x_vd_tdx(t)), vcast_vd_d(0), vcast_vd_d(0)), r);
  o = vgt64_vo_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(16383 + 0x400));
  r = sel_vd3_vo_vd3_vd3(o, cast_vd3_vd_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), tdxgetd3x_vd_tdx(t)), vcast_vd_d(0), vcast_vd_d(0)), r);
  r = sel_vd3_vo_vd3_vd3(visnonfinite_vo_vd(tdxgetd3x_vd_tdx(t)), tdxgetd3_vd3_tdx(t), r);
  return r;
}

// TDX Compare and select functions

// Lane-wise select between two tdx values (exponent and mantissa together).
static INLINE CONST VECTOR_CC tdx sel_tdx_vo_tdx_tdx(vopmask o, tdx x, tdx y) { return tdxseted3_tdx_vm_vd3(vsel_vm_vo64_vm_vm(o, tdxgete_vm_tdx(x), tdxgete_vm_tdx(y)), sel_vd3_vo_vd3_vd3(o, tdxgetd3_vd3_tdx(x), tdxgetd3_vd3_tdx(y))); }

// Three-way compare of two tdx values: returns -1, 0 or +1 per lane for
// t0 < t1, t0 == t1, t0 > t1. Decisions are layered from least to most
// significant: mantissa words z, y, x first, then exponent (valid only when
// the signs agree), then zero/sign disambiguation last so it overrides the
// exponent tests. Continues on the next line.
static INLINE CONST VECTOR_CC vmask cmp_vm_tdx_tdx(tdx t0, tdx t1) {
  vmask r = vcast_vm_i64(0);
  r = vsel_vm_vo64_vm_vm(vlt_vo_vd_vd(tdxgetd3z_vd_tdx(t0), tdxgetd3z_vd_tdx(t1)), vcast_vm_i64(-1), r);
  r = vsel_vm_vo64_vm_vm(vgt_vo_vd_vd(tdxgetd3z_vd_tdx(t0), tdxgetd3z_vd_tdx(t1)), vcast_vm_i64( 1), r);
  r = vsel_vm_vo64_vm_vm(vlt_vo_vd_vd(tdxgetd3y_vd_tdx(t0), tdxgetd3y_vd_tdx(t1)), vcast_vm_i64(-1), r);
  r = vsel_vm_vo64_vm_vm(vgt_vo_vd_vd(tdxgetd3y_vd_tdx(t0), tdxgetd3y_vd_tdx(t1)), vcast_vm_i64( 1), r);
  r = vsel_vm_vo64_vm_vm(vlt_vo_vd_vd(tdxgetd3x_vd_tdx(t0), tdxgetd3x_vd_tdx(t1)),
// (continuation of cmp_vm_tdx_tdx)
vcast_vm_i64(-1), r);
  r = vsel_vm_vo64_vm_vm(vgt_vo_vd_vd(tdxgetd3x_vd_tdx(t0), tdxgetd3x_vd_tdx(t1)), vcast_vm_i64( 1), r);
  // Exponent comparison, applicable only when both operands carry the same sign.
  r = vsel_vm_vo64_vm_vm(vand_vo_vo_vo(vgt64_vo_vm_vm(tdxgete_vm_tdx(t1), tdxgete_vm_tdx(t0)), veq64_vo_vm_vm(vsignbit_vm_vd(tdxgetd3x_vd_tdx(t0)), vsignbit_vm_vd(tdxgetd3x_vd_tdx(t1)))), vsel_vm_vo64_vm_vm(vsignbit_vo_vd(tdxgetd3x_vd_tdx(t0)), vcast_vm_i64(+1), vcast_vm_i64(-1)), r);
  r = vsel_vm_vo64_vm_vm(vand_vo_vo_vo(vgt64_vo_vm_vm(tdxgete_vm_tdx(t0), tdxgete_vm_tdx(t1)), veq64_vo_vm_vm(vsignbit_vm_vd(tdxgetd3x_vd_tdx(t0)), vsignbit_vm_vd(tdxgetd3x_vd_tdx(t1)))), vsel_vm_vo64_vm_vm(vsignbit_vo_vd(tdxgetd3x_vd_tdx(t0)), vcast_vm_i64(-1), vcast_vm_i64(+1)), r);
  // Zero/sign disambiguation overrides the exponent-based result.
  r = vsel_vm_vo64_vm_vm(vand_vo_vo_vo(veq_vo_vd_vd(tdxgetd3x_vd_tdx(t0), vcast_vd_d(0)), veq_vo_vd_vd(tdxgetd3x_vd_tdx(t1), vcast_vd_d(0))), vcast_vm_i64( 0), r);
  r = vsel_vm_vo64_vm_vm(vand_vo_vo_vo(vlt_vo_vd_vd(tdxgetd3x_vd_tdx(t0), vcast_vd_d(0)), vge_vo_vd_vd(tdxgetd3x_vd_tdx(t1), vcast_vd_d(0))), vcast_vm_i64(-1), r);
  r = vsel_vm_vo64_vm_vm(vand_vo_vo_vo(vge_vo_vd_vd(tdxgetd3x_vd_tdx(t0), vcast_vd_d(0)), vlt_vo_vd_vd(tdxgetd3x_vd_tdx(t1), vcast_vd_d(0))), vcast_vm_i64( 1), r);
  return r;
}

// Sign bit of the leading mantissa word.
static INLINE CONST VECTOR_CC vopmask signbit_vo_tdx(tdx x) { return vsignbit_vo_vd(tdxgetd3x_vd_tdx(x)); }

// NaN/Inf/zero tests look only at the leading mantissa word, where the
// representation keeps special values.
static INLINE CONST VECTOR_CC vopmask isnan_vo_tdx(tdx x) { return visnan_vo_vd(tdxgetd3x_vd_tdx(x)); }

static INLINE CONST VECTOR_CC vopmask isinf_vo_tdx(tdx x) { return visinf_vo_vd(tdxgetd3x_vd_tdx(x)); }

static INLINE CONST VECTOR_CC vopmask iszero_vo_tdx(tdx x) { return vandnot_vo_vo_vo(visnan_vo_vd(tdxgetd3x_vd_tdx(x)), veq_vo_vd_vd(tdxgetd3x_vd_tdx(x), vcast_vd_d(0))); }

// Ordered comparisons built on cmp_vm_tdx_tdx; a NaN in x forces false.
// NOTE(review): only x is tested for NaN here; behavior when y alone is NaN
// depends on cmp_vm_tdx_tdx's result for unordered inputs -- confirm against
// the callers' expectations.
static INLINE CONST VECTOR_CC vopmask eq_vo_tdx_tdx(tdx x, tdx y) { return vandnot_vo_vo_vo(visnan_vo_vd(tdxgetd3x_vd_tdx(x)), veq64_vo_vm_vm(cmp_vm_tdx_tdx(x, y), vcast_vm_i64(0))); }

// neq: continues on the next line.
static INLINE CONST VECTOR_CC vopmask neq_vo_tdx_tdx(tdx x, tdx y) { return vandnot_vo_vo_vo(visnan_vo_vd(tdxgetd3x_vd_tdx(x)),
// (continuation of neq_vo_tdx_tdx)
vnot_vo64_vo64(veq64_vo_vm_vm(cmp_vm_tdx_tdx(x, y), vcast_vm_i64(0)))); }

// Ordered relational operators via the three-way compare; the swapped-argument
// forms give <, and the "> -1" forms give >= / <=. NaN in x yields false.
static INLINE CONST VECTOR_CC vopmask gt_vo_tdx_tdx(tdx x, tdx y) { return vandnot_vo_vo_vo(visnan_vo_vd(tdxgetd3x_vd_tdx(x)), vgt64_vo_vm_vm(cmp_vm_tdx_tdx(x, y), vcast_vm_i64(0))); }

static INLINE CONST VECTOR_CC vopmask lt_vo_tdx_tdx(tdx x, tdx y) { return vandnot_vo_vo_vo(visnan_vo_vd(tdxgetd3x_vd_tdx(x)), vgt64_vo_vm_vm(cmp_vm_tdx_tdx(y, x), vcast_vm_i64(0))); }

static INLINE CONST VECTOR_CC vopmask ge_vo_tdx_tdx(tdx x, tdx y) { return vandnot_vo_vo_vo(visnan_vo_vd(tdxgetd3x_vd_tdx(x)), vgt64_vo_vm_vm(cmp_vm_tdx_tdx(x, y), vcast_vm_i64(-1))); }

static INLINE CONST VECTOR_CC vopmask le_vo_tdx_tdx(tdx x, tdx y) { return vandnot_vo_vo_vo(visnan_vo_vd(tdxgetd3x_vd_tdx(x)), vgt64_vo_vm_vm(cmp_vm_tdx_tdx(y, x), vcast_vm_i64(-1))); }

// Integer test: nonzero values with small exponent (< 15400 biased) cannot be
// integers; values with huge exponent (> 17000 biased) always are; otherwise
// all three converted mantissa words must be integral.
static INLINE CONST VECTOR_CC vopmask isint_vo_tdx(tdx t) {
  vdouble3 d = cast_vd3_tdx(t);
  vopmask o0 = vand_vo_vo_vo(vlt64_vo_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(15400)), vneq_vo_vd_vd(tdxgetd3x_vd_tdx(t), vcast_vd_d(0)));
  vopmask o1 = vor_vo_vo_vo(vgt64_vo_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(17000)), vand_vo_vo_vo(vand_vo_vo_vo(visint_vo_vd(vd3getx_vd_vd3(d)), visint_vo_vd(vd3gety_vd_vd3(d))), visint_vo_vd(vd3getz_vd_vd3(d))));
  return vandnot_vo_vo_vo(o0, o1);
}

// Odd test: halve the value (exponent - 1) and check it is NOT an integer.
// Presumably meaningful only for inputs already known to be integers -- confirm
// at call sites.
static INLINE CONST VECTOR_CC vopmask isodd_vo_tdx(tdx t) {
  t = tdxsete_tdx_tdx_vm(t, vadd64_vm_vm_vm(tdxgete_vm_tdx(t), vcast_vm_i64(-1)));
  return vnot_vo64_vo64(isint_vo_tdx(t));
}

// TDX Arithmetic functions

// Negation, absolute value and sign transfer act on the mantissa only;
// the exponent field is left untouched.
static INLINE CONST VECTOR_CC tdx neg_tdx_tdx(tdx x) { return tdxsetd3_tdx_tdx_vd3(x, neg_vd3_vd3(tdxgetd3_vd3_tdx(x))); }

static INLINE CONST VECTOR_CC tdx abs_tdx_tdx(tdx x) { return tdxsetd3_tdx_tdx_vd3(x, abs_vd3_vd3(tdxgetd3_vd3_tdx(x))); }

static INLINE CONST VECTOR_CC tdx mulsign_tdx_tdx_vd(tdx x, vdouble d) { return tdxsetd3_tdx_tdx_vd3(x, mulsign_vd3_vd3_vd(tdxgetd3_vd3_tdx(x), d)); }

// ilogb: unbiased exponent; continues on the next line.
static INLINE CONST VECTOR_CC vmask ilogb_vm_tdx(tdx t) { vmask e = vadd64_vm_vm_vm(tdxgete_vm_tdx(t),
// (continuation of ilogb_vm_tdx: decrement when the mantissa represents a
// value just below a power of two, i.e. x == 1.0 or a negative second word)
vcast_vm_i64(-16383));
  e = vsel_vm_vo64_vm_vm(vor_vo_vo_vo(veq_vo_vd_vd(tdxgetd3x_vd_tdx(t), vcast_vd_d(1.0)), vlt_vo_vd_vd(tdxgetd3y_vd_tdx(t), vcast_vd_d(0))), vadd64_vm_vm_vm(e, vcast_vm_i64(-1)), e);
  return e;
}

// Addition: align dd1 onto dd0's scale with t = 2^(e1-e0), add the mantissas,
// renormalize the result's leading word to unit range and fix up the exponent
// (taking dd1's exponent verbatim when dd0 is zero). When the exponents differ
// by more than 200 the smaller operand cannot affect the sum, so the dominant
// operand is returned as-is.
static INLINE CONST VECTOR_CC tdx add_tdx_tdx_tdx(tdx dd0, tdx dd1) { // finite numbers only
  vmask ed = vsub64_vm_vm_vm(tdxgete_vm_tdx(dd1), tdxgete_vm_tdx(dd0));
  vdouble t = vldexp3_vd_vd_vm(vcast_vd_d(1), ed);
  vdouble3 rd3 = scaleadd2_vd3_vd3_vd3_vd(tdxgetd3_vd3_tdx(dd0), tdxgetd3_vd3_tdx(dd1), t);
  tdx r = tdxseted3_tdx_vm_vd3(vilogb2k_vm_vd(vd3getx_vd_vd3(rd3)), rd3);
  t = vldexp3_vd_vd_vm(vcast_vd_d(1), vneg64_vm_vm(tdxgete_vm_tdx(r)));
  vopmask o = veq_vo_vd_vd(tdxgetd3x_vd_tdx(dd0), vcast_vd_d(0));
  r = tdxsete_tdx_tdx_vm(r, vsel_vm_vo64_vm_vm(o, tdxgete_vm_tdx(dd1), vadd64_vm_vm_vm(tdxgete_vm_tdx(r), tdxgete_vm_tdx(dd0))));
  r = tdxsetd3_tdx_tdx_vd3(r, scale_vd3_vd3_vd(tdxgetd3_vd3_tdx(r), t));
  r = sel_tdx_vo_tdx_tdx(vgt64_vo_vm_vm(ed, vcast_vm_i64(200)), dd1, r);
  r = sel_tdx_vo_tdx_tdx(vgt64_vo_vm_vm(vcast_vm_i64(-200), ed), dd0, r);
  return r;
}

// Subtraction: identical structure to add_tdx_tdx_tdx but subtracting, and
// returning -dd1 when dd1 dominates. Finite numbers only.
static INLINE CONST VECTOR_CC tdx sub_tdx_tdx_tdx(tdx dd0, tdx dd1) {
  vmask ed = vsub64_vm_vm_vm(tdxgete_vm_tdx(dd1), tdxgete_vm_tdx(dd0));
  vdouble t = vldexp3_vd_vd_vm(vcast_vd_d(1), ed);
  vdouble3 rd3 = scalesub2_vd3_vd3_vd3_vd(tdxgetd3_vd3_tdx(dd0), tdxgetd3_vd3_tdx(dd1), t);
  tdx r = tdxseted3_tdx_vm_vd3(vilogb2k_vm_vd(vd3getx_vd_vd3(rd3)), rd3);
  t = vldexp3_vd_vd_vm(vcast_vd_d(1), vneg64_vm_vm(tdxgete_vm_tdx(r)));
  vopmask o = veq_vo_vd_vd(tdxgetd3x_vd_tdx(dd0), vcast_vd_d(0));
  r = tdxsete_tdx_tdx_vm(r, vsel_vm_vo64_vm_vm(o, tdxgete_vm_tdx(dd1), vadd64_vm_vm_vm(tdxgete_vm_tdx(r), tdxgete_vm_tdx(dd0))));
  r = tdxsetd3_tdx_tdx_vd3(r, scale_vd3_vd3_vd(tdxgetd3_vd3_tdx(r), t));
  r = sel_tdx_vo_tdx_tdx(vgt64_vo_vm_vm(ed, vcast_vm_i64(200)), neg_tdx_tdx(dd1), r);
  r = sel_tdx_vo_tdx_tdx(vgt64_vo_vm_vm(vcast_vm_i64(-200), ed), dd0, r);
  return r;
}

// Multiplication: body continues on the next line.
static INLINE CONST VECTOR_CC tdx mul_tdx_tdx_tdx(tdx dd0, tdx dd1) {
// (continuation of mul_tdx_tdx_tdx: multiply the mantissa triples,
// renormalize the leading word to unit range, and combine the biased
// exponents: e0 + e1 - bias + renormalization shift; a zero result forces
// exponent 0)
vdouble3 rd3 = mul2_vd3_vd3_vd3(tdxgetd3_vd3_tdx(dd0), tdxgetd3_vd3_tdx(dd1));
  tdx r = tdxseted3_tdx_vm_vd3(vilogb2k_vm_vd(vd3getx_vd_vd3(rd3)), rd3);
  vdouble t = vldexp3_vd_vd_vm(vcast_vd_d(1), vneg64_vm_vm(tdxgete_vm_tdx(r)));
  r = tdxsetd3_tdx_tdx_vd3(r, scale_vd3_vd3_vd(tdxgetd3_vd3_tdx(r), t));
  vmask e = vadd64_vm_vm_vm(vadd64_vm_vm_vm(vadd64_vm_vm_vm(tdxgete_vm_tdx(dd0), tdxgete_vm_tdx(dd1)), vcast_vm_i64(-16383)), tdxgete_vm_tdx(r));
  e = vsel_vm_vo64_vm_vm(veq_vo_vd_vd(tdxgetd3x_vd_tdx(r), vcast_vd_d(0)), vcast_vm_i64(0), e);
  r = tdxsete_tdx_tdx_vm(r, e);
  return r;
}

// Division: same renormalization scheme as multiplication, but the combined
// exponent is e0 - e1 + bias (+ renormalization shift).
static INLINE CONST VECTOR_CC tdx div_tdx_tdx_tdx(tdx dd0, tdx dd1) {
  vdouble3 rd3 = div2_vd3_vd3_vd3(tdxgetd3_vd3_tdx(dd0), tdxgetd3_vd3_tdx(dd1));
  tdx r = tdxseted3_tdx_vm_vd3(vilogb2k_vm_vd(vd3getx_vd_vd3(rd3)), rd3);
  vdouble t = vldexp3_vd_vd_vm(vcast_vd_d(1), vneg64_vm_vm(tdxgete_vm_tdx(r)));
  r = tdxsetd3_tdx_tdx_vd3(r, scale_vd3_vd3_vd(tdxgetd3_vd3_tdx(r), t));
  vmask e = vadd64_vm_vm_vm(vadd64_vm_vm_vm(vsub64_vm_vm_vm(tdxgete_vm_tdx(dd0), tdxgete_vm_tdx(dd1)), vcast_vm_i64(16383)), tdxgete_vm_tdx(r));
  e = vsel_vm_vo64_vm_vm(veq_vo_vd_vd(tdxgetd3x_vd_tdx(r), vcast_vd_d(0)), vcast_vm_i64(0), e);
  r = tdxsete_tdx_tdx_vm(r, e);
  return r;
}

// TDX math functions

// Square root: pre-scales the mantissa (by 1 or 2 depending on exponent
// parity) so the exponent can be halved exactly; the halved exponent is
// computed with an offset of 1<<21 / 1<<20 to keep the 64-bit shift of a
// possibly negative biased value well-defined. Continues on the next line.
static INLINE CONST VECTOR_CC tdx sqrt_tdx_tdx(tdx dd0) {
  vopmask o = veq64_vo_vm_vm(vand_vm_vm_vm(tdxgete_vm_tdx(dd0), vcast_vm_i64(1)), vcast_vm_i64(1));
  dd0 = tdxsetd3_tdx_tdx_vd3(dd0, scale_vd3_vd3_vd(tdxgetd3_vd3_tdx(dd0), vsel_vd_vo_vd_vd(o, vcast_vd_d(1), vcast_vd_d(2))));
  vdouble3 rd3 = sqrt_vd3_vd3(tdxgetd3_vd3_tdx(dd0));
  tdx r = tdxseted3_tdx_vm_vd3(vilogb2k_vm_vd(vd3getx_vd_vd3(rd3)), rd3);
  r = tdxsetd3_tdx_tdx_vd3(r, scale_vd3_vd3_vd(tdxgetd3_vd3_tdx(r), vldexp3_vd_vd_vm(vcast_vd_d(1), vneg64_vm_vm(tdxgete_vm_tdx(r)))));
  r = tdxsete_tdx_tdx_vm(r, vadd64_vm_vm_vm(vsub64_vm_vm_vm(vsrl64_vm_vm_i(vadd64_vm_vm_vm(tdxgete_vm_tdx(dd0), vcast_vm_i64(16383+(1 << 21))), 1), vcast_vm_i64(1 << 20)), tdxgete_vm_tdx(r)));
  o =
// (continuation of sqrt_tdx_tdx: mask the whole mantissa to zero when the
// input's leading word is zero, so sqrt(0) stays 0)
vneq_vo_vd_vd(tdxgetd3x_vd_tdx(dd0), vcast_vd_d(0));
  return tdxsetxyz_tdx_tdx_vd_vd_vd(r, vreinterpret_vd_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(tdxgetd3x_vd_tdx(r)))), vreinterpret_vd_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(tdxgetd3y_vd_tdx(r)))), vreinterpret_vd_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(tdxgetd3z_vd_tdx(r)))));
}

// Cube root: split the exponent e into qu = e/3 and remainder re (offset by
// 60000/20000 so the truncations behave for negative e); q becomes
// sign(d) * 2^(e/3) * cbrt(2^re), using triple-double constants for cbrt(2)
// and cbrt(4). The mantissa's cube root is then found by a polynomial
// first guess for 1/cbrt refined by Newton steps in double precision
// (continued in double-double and triple-double on the next line).
static INLINE CONST VECTOR_CC tdx cbrt_tdx_tdx(tdx d) {
  vmask e = vadd64_vm_vm_vm(tdxgete_vm_tdx(d), vcast_vm_i64(-16382));
  d = tdxsete_tdx_tdx_vm(d, vsub64_vm_vm_vm(tdxgete_vm_tdx(d), e));
  vdouble t = vadd_vd_vd_vd(vcast_vd_vm(e), vcast_vd_d(60000));
  vint qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0)));
  vint re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3))));
  tdx q = cast_tdx_d(1);
  q = sel_tdx_vo_tdx_tdx(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), cast_tdx_d_d_d(1.2599210498948731907, -2.5899333753005069177e-17, 9.7278081561563724019e-34), q);
  q = sel_tdx_vo_tdx_tdx(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), cast_tdx_d_d_d(1.5874010519681995834, -1.0869008194197822986e-16, 9.380961956715938535e-34), q);
  q = tdxsete_tdx_tdx_vm(q, vadd64_vm_vm_vm(tdxgete_vm_tdx(q), vcast_vm_vi(vsub_vi_vi_vi(qu, vcast_vi_i(20000)))));
  q = mulsign_tdx_tdx_vd(q, tdxgetd3x_vd_tdx(d));
  vdouble3 d3 = abs_vd3_vd3(cast_vd3_tdx(d));
  // Polynomial initial guess for 1/cbrt(d3.x), then Newton refinement:
  // x -= (d3.x * x^4 - x) / 3.
  vdouble x = vcast_vd_d(-0.640245898480692909870982), y;
  x = vmla_vd_vd_vd_vd(x, vd3getx_vd_vd3(d3), vcast_vd_d(2.96155103020039511818595));
  x = vmla_vd_vd_vd_vd(x, vd3getx_vd_vd3(d3), vcast_vd_d(-5.73353060922947843636166));
  x = vmla_vd_vd_vd_vd(x, vd3getx_vd_vd3(d3), vcast_vd_d(6.03990368989458747961407));
  x = vmla_vd_vd_vd_vd(x, vd3getx_vd_vd3(d3), vcast_vd_d(-3.85841935510444988821632));
  x = vmla_vd_vd_vd_vd(x, vd3getx_vd_vd3(d3), vcast_vd_d(2.2307275302496609725722));
  y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y);
  x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(vd3getx_vd_vd3(d3), y, x), vcast_vd_d(1.0 / 3.0)));
  y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y);
x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(vd3getx_vd_vd3(d3), y, x), vcast_vd_d(1.0 / 3.0))); vdouble2 y2 = ddmul_vd2_vd_vd(x, x); y2 = ddsqu_vd2_vd2(y2); vdouble2 x2 = vcast_vd2_vd_vd(vd3getx_vd_vd3(d3), vd3gety_vd_vd3(d3)); x2 = ddadd_vd2_vd_vd2(x, ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, y2), vneg_vd_vd(x)), vcast_vd2_d_d(-0.33333333333333331483, -1.8503717077085941313e-17))); vdouble3 y3 = mul_vd3_vd3_vd3(d3, mul_vd3_vd2_vd2(x2, x2)); vdouble3 x3 = cast_vd3_d_d_d(-0.66666666666666662966, -3.7007434154171882626e-17, -2.0543252740130514626e-33); x3 = mul_vd3_vd3_vd3(mul_vd3_vd3_vd3(x3, y3), add2_vd3_vd_vd3(vcast_vd_d(-1), mul_vd3_vd2_vd3(x2, y3))); y3 = add2_vd3_vd3_vd3(y3, x3); return sel_tdx_vo_tdx_tdx(vor_vo_vo_vo(isinf_vo_tdx(d), iszero_vo_tdx(d)), d, mul_tdx_tdx_tdx(q, cast_tdx_vd3(y3))); } static CONST VECTOR_CC tdi_t rempio2q(tdx a) { const int N = 8, B = 8; const int NCOL = 53-B, NROW = (16385+(53-B)*N-106)/NCOL+1; vmask e = ilogb_vm_tdx(a); e = vsel_vm_vo64_vm_vm(vgt64_vo_vm_vm(e, vcast_vm_i64(106)), e, vcast_vm_i64(106)); a = tdxsete_tdx_tdx_vm(a, vadd64_vm_vm_vm(tdxgete_vm_tdx(a), vsub64_vm_vm_vm(vcast_vm_i64(106), e))); vdouble row = vtruncate_vd_vd(vmul_vd_vd_vd(vcast_vd_vm(vsub64_vm_vm_vm(e, vcast_vm_i64(106))), vcast_vd_d(1.0 / NCOL))); // (e - 106) / NCOL; vdouble col = vsub_vd_vd_vd(vcast_vd_vm(vsub64_vm_vm_vm(e, vcast_vm_i64(106))), vmul_vd_vd_vd(row, vcast_vd_d(NCOL))); // (e - 106) % NCOL; vint p = vtruncate_vi_vd(vmla_vd_vd_vd_vd(col, vcast_vd_d(NROW), row)); vint q = vcast_vi_i(0); vdouble3 d = normalize_vd3_vd3(cast_vd3_tdx(a)), x = cast_vd3_d_d_d(0, 0, 0); for(int i=0;i #include #include #include #include #include #endif static const tdx exp10tab[14] = { { 16386, { 1.25, 0, 0 } }, // 10 { 16389, { 1.5625, 0, 0 } }, // 100 { 16396, { 1.220703125, 0, 0 } }, // 10000 { 16409, { 1.490116119384765625, 0, 0 } }, // 1e+08 { 16436, { 1.1102230246251565404, 0, 0 } }, // 1e+16 { 16489, { 1.2325951644078310121, 
  // (continuation of the exp10tab initializer started above)
             -6.6143055845634601483e-17, 0 } }, // 1e+32
  { 16595, { 1.519290839321567832, -3.2391917291561477915e-17, -1.8687814275678753633e-33 } }, // 1e+64
  { 16808, { 1.1541223272232170594, -8.6760553787903265289e-17, -5.7759618887794337486e-33 } }, // 1e+128
  { 17233, { 1.3319983461951343529, -4.0129993161716667573e-17, -4.1720927621797370111e-34 } }, // 1e+256
  { 18083, { 1.7742195942665728303, 4.9309343678620668082e-17, 1.3386888736008621608e-34 } }, // 1e+512
  { 19784, { 1.5739275843397213528, -1.0848584040002990893e-16, 4.3586291506346591213e-33 } }, // 1e+1024
  { 23186, { 1.2386240203727352238, -5.8476062413608067671e-17, -2.0006771920677486581e-33 } }, // 1e+2048
  { 29989, { 1.5341894638443178689, -1.0973609479387666102e-17, -6.5816871252891901643e-34 } }, // 1e+4096
  { 43596, { 1.1768686554854577153, 3.0579788864750933707e-17, -2.6771867381968692559e-34 } }, // 1e+8192
};

// Returns 10^n as an unpacked quad via binary exponentiation over exp10tab.
// Negative n: computes 10^|n| then takes the reciprocal.
static CONST tdx exp10i(int n) {
  int neg = 0;
  if (n < 0) { neg = 1; n = -n; }
  tdx r = cast_tdx_d(1);
  for(int i=0;i<14;i++) if ((n & (1 << i)) != 0) r = mul_tdx_tdx_tdx(r, exp10tab[i]);
  if (neg) r = div_tdx_tdx_tdx(cast_tdx_d(1), r);
  return r;
}

// Integer base-10 logarithm of t, by greedy division against exp10tab from
// the largest entry down.  Inputs below 1 are inverted first; p tracks the
// result's sign and the strict/non-strict comparison used on each step.
static CONST int ilog10(tdx t) {
  int r = 0, p = 1;
  if ((int)cmp_vm_tdx_tdx(t, cast_tdx_d(1)) < 0) { t = div_tdx_tdx_tdx(cast_tdx_d(1), t); p = -1; }
  for(int i=12;i>=0;i--) {
    int c = cmp_vm_tdx_tdx(t, exp10tab[i]);
    if ((p > 0 && c >= 0) || (p < 0 && c > 0)) { t = div_tdx_tdx_tdx(t, exp10tab[i]); r |= (1 << i); }
  }
  if (p < 0) r++;
  return r * p;
}

// 128-bit logical shift left of the x (low) / y (high) word pair by c bits.
static CONST vquad xsll128(vquad m, int c) {
  if (c == 0) return m;
  if (c >= 128) return imdvq_vq_vm_vm(0, 0);
  if (c < 64) {
    return imdvq_vq_vm_vm(vsll64_vm_vm_i(vqgetx_vm_vq(m), c), vor_vm_vm_vm(vsll64_vm_vm_i(vqgety_vm_vq(m), c), vsrl64_vm_vm_i(vqgetx_vm_vq(m), 64 - c)));
  }
  return imdvq_vq_vm_vm(0, vsll64_vm_vm_i(vqgetx_vm_vq(m), c - 64));
}

// 128-bit logical shift right of the x (low) / y (high) word pair by c bits.
static CONST vquad xsrl128(vquad m, int c) {
  if (c == 0) return m;
  if (c >= 128) return imdvq_vq_vm_vm(0, 0);
  if (c < 64) {
    return imdvq_vq_vm_vm(vor_vm_vm_vm(vsrl64_vm_vm_i(vqgetx_vm_vq(m), c), vsll64_vm_vm_i(vqgety_vm_vq(m), 64 - c)), vsrl64_vm_vm_i(vqgety_vm_vq(m), c));
  }
  return imdvq_vq_vm_vm(vsrl64_vm_vm_i(vqgety_vm_vq(m), c - 64), 0);
}

// Count of leading zero bits in the 128-bit value m (binary search over u).
static int xclz128(vquad m) {
  int n = m.y == 0 ? 128 : 64;
  uint64_t u = m.y == 0 ? m.x : m.y, v;
  v = u >> 32; if (v != 0) { n -= 32; u = v; }
  v = u >> 16; if (v != 0) { n -= 16; u = v; }
  v = u >> 8; if (v != 0) { n -= 8; u = v; }
  v = u >> 4; if (v != 0) { n -= 4; u = v; }
  v = u >> 2; if (v != 0) { n -= 2; u = v; }
  v = u >> 1; if (v != 0) return n - 2;
  return n - u;
}

// Float128 string functions

// strtod-style parser producing a Sleef quad.  Accepts optional sign,
// "nan" / "inf" / "infinity" (case-insensitive), C99 hexadecimal floats
// ("0x...p<e>"), and decimal input with optional 'e'/'E' exponent.
// On success *endptr (if non-NULL) points at the first unconsumed character;
// if nothing parses, *endptr is reset to str and +0 is returned.
EXPORT vargquad Sleef_strtoq(const char *str, const char **endptr) {
  while(isspace((int)*str)) str++;
  const char *p = str;
  int positive = 1, bp = 0, e = 0, mf = 0;
  tdx n = cast_tdx_d(0), d = cast_tdx_d(1);
  if (*p == '-') { positive = 0; p++; } else if (*p == '+') p++;
  if (tolower(p[0]) == 'n' && tolower(p[1]) == 'a' && tolower(p[2]) == 'n') {
    if (endptr != NULL) *endptr = p+3;
    // All-ones payload quiet NaN; sign bit set for "-nan".
    vquad r = { vcast_vm_i64(-1), vcast_vm_u64(UINT64_C(0x7fffffffffffffff)) };
    r.y |= ((uint64_t)!positive) << 63;
    return cast_aq_vq(r);
  }
  if (tolower(p[0]) == 'i' && tolower(p[1]) == 'n' && tolower(p[2]) == 'f') {
    if (endptr != NULL) *endptr = p+3;
    // Consume the optional "inity" suffix as well.
    if (tolower(p[3]) == 'i' && tolower(p[4]) == 'n' && tolower(p[5]) == 'i' &&
	tolower(p[6]) == 't' && tolower(p[7]) == 'y' && endptr != NULL) *endptr = p+8;
    vquad r = { vcast_vm_i64(0), vcast_vm_u64(UINT64_C(0x7fff000000000000)) };
    r.y |= ((uint64_t)!positive) << 63;
    return cast_aq_vq(r);
  }
  if (*p == '0' && *(p+1) == 'x') {
    // Hexadecimal float: accumulate nibbles into the 128-bit integer m,
    // tracking pp = 4 * (number of digits after the point).
    p += 2;
    vquad m = { 0, 0 };
    int pp = 0;
    while(*p != '\0') {
      int d = 0;
      if (('0' <= *p && *p <= '9')) { d = *p++ - '0'; }
      else if ('a' <= *p && *p <= 'f') { d = *p++ - 'a' + 10; }
      else if ('A' <= *p && *p <= 'F') { d = *p++ - 'A' + 10; }
      else if (*p == '.') { if (bp) break; bp = 1; p++; continue; }
      else break;
      mf = 1;
      m = xsll128(m, 4);
      m.x += d;
      pp += bp * 4;
      if (m.y >> 60) break; // stop before the accumulator overflows
    }
    // Skip any further digits that no longer fit.
    while(('0' <= *p && *p <= '9') || ('a' <= *p && *p <= 'f') || ('A' <= *p && *p <= 'F')) p++;
    if (*p == '.' && !bp) p++;
    while(('0' <= *p && *p <= '9') || ('a' <= *p && *p <= 'f') || ('A' <= *p && *p <= 'F')) p++;
    if (*p == 'p' || *p == 'P') {
      char *q;
      e = strtol(p+1, &q, 10);
      if (p+1 == q || isspace((int)*(p+1))) { e = 0; } else { p = q; }
    }
    // Normalize so the leading 1 lands in bit 112; round when shifting right.
    int nsl = xclz128(m) - 15;
    e = e - pp - nsl + 0x3fff + 112;
    if (e <= 0 || nsl == 128 - 15) { nsl += e - 1; e = 0; } // subnormal / zero
    if (nsl >= 0) { m = xsll128(m, nsl); }
    else {
      // Round-to-nearest: add half an ulp before the right shift.
      if (-nsl-1 < 64) {
	uint64_t u = m.x + ((1ULL) << (-nsl - 1));
	if (u < m.x) m.y++;
	m.x = u;
      } else {
	m.y += ((1ULL) << (-nsl - 1 - 64));
      }
      m = xsrl128(m, -nsl);
    }
    // Assemble IEEE binary128: sign | 15-bit exponent | 112-bit fraction.
    m.y &= 0x0000ffffffffffff;
    m.y |= (((uint64_t)!positive) << 63) | ((((uint64_t)e) & 0x7fff) << 48);
    if (endptr != NULL) *endptr = p;
    return cast_aq_vq(m);
  }
  // Decimal path: n accumulates digits, d the power of ten for the fraction.
  while(*p != '\0') {
    if ('0' <= *p && *p <= '9') {
      n = add_tdx_tdx_tdx(mul_tdx_tdx_tdx(n, cast_tdx_d(10)), cast_tdx_d(*p - '0'));
      if (bp) d = mul_tdx_tdx_tdx(d, cast_tdx_d(10));
      p++;
      mf = 1;
      continue;
    }
    if (*p == '.') { if (bp) break; bp = 1; p++; continue; }
    if (*p == 'e' || *p == 'E') {
      char *q;
      e = strtol(p+1, &q, 10);
      if (p+1 == q || isspace((int)*(p+1))) { e = 0; } else { p = q; }
    }
    break;
  }
  if (!mf && !bp) {
    // Nothing consumed: report str back and return +0.
    if (endptr != NULL) *endptr = str;
    vquad r = { vcast_vm_i64(0), vcast_vm_i64(0) };
    return cast_aq_vq(r);
  }
  n = div_tdx_tdx_tdx(n, d);
  if (e > 0) n = mul_tdx_tdx_tdx(n, exp10i(+e));
  if (e < 0) n = div_tdx_tdx_tdx(n, exp10i(-e));
  if (!positive) n = neg_tdx_tdx(n);
  if (endptr != NULL) *endptr = p;
  return cast_aq_vq(cast_vq_tdx(n));
}

//

// printf-style formatting flags shared by snprintquad / snprintquadhex.
#define FLAG_SIGN (1 << 0)
#define FLAG_BLANK (1 << 1)
#define FLAG_ALT (1 << 2)
#define FLAG_LEFT (1 << 3)
#define FLAG_ZERO (1 << 4)
#define FLAG_UPPER (1 << 5)

// Formats argvalue into buf as %e / %f / %g (typespec selects which), honoring
// width, precision and FLAG_* bits.  Returns the output length, or -1 when the
// output would not fit in bufsize.
static int snprintquad(char *buf, size_t bufsize, vargquad argvalue, int typespec, int width, int precision, int flags) {
  if (width > bufsize) width = bufsize;
  vquad c128 = cast_vq_aq(argvalue);
  char *ptr = buf;
  char prefix = 0;
  int length = 0, flag_rtz = 0;
  assert(typespec == 'e' || typespec == 'f' ||
	 typespec == 'g');
  // Split off the sign bit; choose the '-' / '+' / ' ' prefix.
  if ((c128.y & UINT64_C(0x8000000000000000)) != 0) {
    c128.y ^= UINT64_C(0x8000000000000000);
    prefix = '-';
  } else if (flags & FLAG_SIGN) {
    prefix = '+';
  } else if (flags & FLAG_BLANK) {
    prefix = ' ';
  }
  tdx value = cast_tdx_vq(c128);
  if (isnan_vo_vq(c128)) {
    if (prefix) *ptr++ = prefix;
    ptr += snprintf(ptr, buf + bufsize - ptr, (flags & FLAG_UPPER) ? "NAN" : "nan");
    flags &= ~FLAG_ZERO;
  } else if (isinf_vo_vq(c128)) {
    if (prefix) *ptr++ = prefix;
    ptr += snprintf(ptr, buf + bufsize - ptr, (flags & FLAG_UPPER) ? "INF" : "inf");
    flags &= ~FLAG_ZERO;
  } else {
    if (precision < 0) precision = 6; // C default precision
    if (precision > bufsize/2 - 10) precision = bufsize/2 - 10;
    if (typespec == 'g' && precision > 0) precision--;
    // Half an ulp in the last requested decimal place, for rounding.
    tdx rounder = mul_tdx_tdx_tdx(cast_tdx_d(0.5), exp10i(-precision));
    if (typespec == 'f') value = add_tdx_tdx_tdx(value, rounder);
    // Scale value into [1, 10); exp is the decimal exponent.
    int exp = 0, e2 = 0;
    if (!iszero_vo_vq(c128)) {
      exp = ilog10(value);
      value = mul_tdx_tdx_tdx(value, exp10i(-exp));
    }
    int flag_exp = typespec == 'e';
    if (typespec != 'f') { value = add_tdx_tdx_tdx(value, rounder); }
    // Rounding may have pushed value to 10 or above; renormalize once.
    if ((int)cmp_vm_tdx_tdx(value, cast_tdx_d(10.0)) >= 0) {
      value = div_tdx_tdx_tdx(value, cast_tdx_d(10));
      exp++;
    }
    if (typespec == 'g') {
      // %g: pick %e or %f by exponent; trailing zeros removed unless '#'.
      flag_rtz = !(flags & FLAG_ALT);
      if (exp < -4 || exp > precision) { typespec = 'e'; } else { precision = precision - exp; typespec = 'f'; }
    }
    if (typespec != 'e') e2 = exp;
    int flag_dp = (precision > 0) | (flags & FLAG_ALT);
    if (prefix) *ptr++ = prefix;
    // Integer part: emit digits one at a time, multiplying the residue by 10.
    if (e2 < 0) { *ptr++ = '0'; } else {
      for(;e2>=0;e2--) {
	int digit = (int)cast_vd_tdx(value);
	if ((int)cmp_vm_tdx_tdx(value, cast_tdx_d(digit)) < 0) digit--;
	if (ptr - buf >= (ptrdiff_t)bufsize-1) { *ptr = '\0'; return -1; }
	*ptr++ = digit + '0';
	value = mul_tdx_tdx_tdx(add_tdx_tdx_tdx(value, cast_tdx_d(-digit)), cast_tdx_d(10));
      }
    }
    if (flag_dp) {
      if (ptr - buf >= (ptrdiff_t)bufsize-1) { *ptr = '\0'; return -1; }
      *ptr++ = '.';
    }
    // Leading zeros between the point and the first significant digit.
    for(e2++;e2 < 0 && precision > 0;precision--, e2++) {
      if (ptr - buf >= (ptrdiff_t)bufsize-1) { *ptr = '\0';
	return -1; }
      *ptr++ = '0';
    }
    // Fractional digits.
    while (precision-- > 0) {
      int digit = (int)cast_vd_tdx(value);
      if ((int)cmp_vm_tdx_tdx(value, cast_tdx_d(digit)) < 0) digit--;
      if (ptr - buf >= (ptrdiff_t)bufsize-1) { *ptr = '\0'; return -1; }
      *ptr++ = digit + '0';
      value = mul_tdx_tdx_tdx(add_tdx_tdx_tdx(value, cast_tdx_d(-digit)), cast_tdx_d(10));
    }
    // %g without '#': strip trailing zeros, and a trailing point.
    if (flag_rtz && flag_dp) {
      while(ptr[-1] == '0') *(--ptr) = 0;
      assert(ptr > buf);
      if (ptr[-1] == '.') *(--ptr) = 0;
    }
    // Exponent field, e.g. "e+05"; supports up to 4-digit exponents.
    if (flag_exp || (typespec == 'e' && exp)) {
      if (ptr - buf >= (ptrdiff_t)bufsize-8) { *ptr = '\0'; return -1; }
      *ptr++ = (flags & FLAG_UPPER) ? 'E' : 'e';
      if (exp < 0){ *ptr++ = '-'; exp = -exp; } else { *ptr++ = '+'; }
      if (exp >= 1000) { *ptr++ = exp / 1000 + '0'; exp %= 1000; *ptr++ = exp / 100 + '0'; exp %= 100; }
      else if (exp >= 100) { *ptr++ = exp / 100 + '0'; exp %= 100; }
      *ptr++ = exp / 10 + '0';
      *ptr++ = exp % 10 + '0';
    }
  }
  *ptr = 0;
  length = ptr - buf;
  ptr = buf;
  // Right-justify: shift the text right and pad with '0' or ' '.
  if (!(flags & FLAG_LEFT) && length < width) {
    int i;
    int nPad = width - length;
    for(i=width; i>=nPad; i--) { ptr[i] = ptr[i-nPad]; }
    i = prefix != 0 && (flags & FLAG_ZERO); // zero-pad after the sign, not before
    while (nPad--) { ptr[i++] = (flags & FLAG_ZERO) ? '0' : ' '; }
    length = width;
  }
  // Left-justify: pad with spaces on the right.
  if (flags & FLAG_LEFT) {
    while (length < width) { ptr[length++] = ' '; }
    ptr[length] = '\0';
  }
  return length;
}

// Formats argvalue into buf as a C99 hexadecimal float (%a / %A), honoring
// width, precision and FLAG_* bits.  Returns the output length.
static int snprintquadhex(char *buf, size_t bufsize, vargquad argvalue, int width, int precision, int flags) {
  if (width > bufsize) width = bufsize;
  char *bufend = buf + bufsize, *ptr = buf;
  vquad c128 = cast_vq_aq(argvalue);
  int mainpos = 0;
  if (c128.y >> 63) {
    ptr += snprintf(ptr, bufend - ptr, "-");
  } else if (flags & FLAG_SIGN) {
    ptr += snprintf(ptr, bufend - ptr, "+");
  } else if (flags & FLAG_BLANK) {
    ptr += snprintf(ptr, bufend - ptr, " ");
  }
  if (isnan_vo_vq(c128)) {
    ptr += snprintf(ptr, bufend - ptr, (flags & FLAG_UPPER) ? "NAN" : "nan");
    flags &= ~FLAG_ZERO;
  } else if (isinf_vo_vq(c128)) {
    ptr += snprintf(ptr, bufend - ptr, (flags & FLAG_UPPER) ?
		    "INF" : "inf");
    flags &= ~FLAG_ZERO;
  } else {
    int iszero = iszero_vo_vq(c128);
    // Round the 112-bit fraction at the requested hex digit (28 max).
    if (precision >= 0 && precision < 28) {
      int s = (28 - precision) * 4 - 1;
      if (s < 64) {
	uint64_t u = c128.x + (((uint64_t)1) << s);
	if (u < c128.x) c128.y++;
	c128.x = u;
      } else {
	c128.y += ((uint64_t)1) << (s - 64);
      }
    }
    int exp = (c128.y >> 48) & 0x7fff;
    // h:l hold the fraction left-aligned; digits are peeled off the top nibble.
    uint64_t h = c128.y << 16, l = c128.x;
    ptr += snprintf(ptr, bufend - ptr, (flags & FLAG_UPPER) ? "0X" :"0x");
    mainpos = ptr - buf; // zero-padding (if any) is inserted after "0x"
    ptr += snprintf(ptr, bufend - ptr, exp != 0 ? "1" : "0"); // hidden bit
    if ((!(h == 0 && l == 0 && precision < 0) && precision != 0) || (flags & FLAG_ALT))
      ptr += snprintf(ptr, bufend - ptr, ".");
    const char *digits = (flags & FLAG_UPPER) ? "0123456789ABCDEF" : "0123456789abcdef";
    int niter = (precision < 0 || precision > 28) ? 28 : precision;
    // First 12 hex digits come from h, the remaining 16 from l.
    for(int i=0;i<12 && i < niter;i++) {
      if (h == 0 && l == 0 && precision < 0) break; // default precision: stop at last nonzero
      ptr += snprintf(ptr, bufend - ptr, "%c", digits[(h >> 60) & 0xf]);
      h <<= 4;
    }
    for(int i=0;i<16 && i < niter-12;i++) {
      if (l == 0 && precision < 0) break;
      ptr += snprintf(ptr, bufend - ptr, "%c", digits[(l >> 60) & 0xf]);
      l <<= 4;
    }
    if (exp == 0) exp++;           // subnormals share the minimum exponent
    if (iszero) exp = 0x3fff;      // so that zero prints "p+0"
    ptr += snprintf(ptr, bufend - ptr, "%c%+d", (flags & FLAG_UPPER) ? 'P' : 'p', exp - 0x3fff);
  }
  int length = ptr - buf;
  ptr = buf;
  if (!(flags & FLAG_ZERO)) mainpos = 0;
  // Right-justify, keeping any zero padding after the "0x" prefix.
  if (!(flags & FLAG_LEFT) && length < width) {
    int i;
    int nPad = width - length;
    for(i=width;i-nPad>=mainpos; i--) { ptr[i] = ptr[i-nPad]; }
    i = mainpos;
    while (nPad--) { ptr[i++] = (flags & FLAG_ZERO) ?
		     '0' : ' '; }
    length = width;
  }
  if (flags & FLAG_LEFT) {
    while (length < width) { ptr[length++] = ' '; }
    ptr[length] = '\0';
  }
  return length;
}

// Scratch buffer size for one formatted conversion.
#define XBUFSIZE 5000

// Core of the Sleef_*printf family.  Walks fmt, hands %Qe/%Qf/%Qg/%Qa (and
// pointer variant %P...) conversions to snprintquad/snprintquadhex, and
// forwards every other conversion to the host vsnprintf via a copied
// sub-format string.  consumer() receives each chunk of output.
// Returns total bytes emitted, or -1 on a malformed/unsupported format.
static int xvprintf(size_t (*consumer)(const char *ptr, size_t size, void *arg), void *arg, const char *fmt, va_list ap) {
  char *xbuf = (char *)malloc(XBUFSIZE+10);
  int outlen = 0, errorflag = 0;
  while(*fmt != '\0' && !errorflag) {
    // Copy the format string until a '%' is read
    if (*fmt != '%') {
      do { outlen += (*consumer)(fmt++, 1, arg); } while(*fmt != '%' && *fmt != '\0');
      if (*fmt == '\0') break;
    }
    const char *subfmtstart = fmt;
    if ((*++fmt) == '\0') {
      errorflag = 1;
      outlen += (*consumer)("%", 1, arg);
      break;
    }
    // Read flags
    int flags = 0, done = 0;
    do {
      switch(*fmt) {
      case '-': flags |= FLAG_LEFT; break;
      case '+': flags |= FLAG_SIGN; break;
      case ' ': flags |= FLAG_BLANK; break;
      case '#': flags |= FLAG_ALT; break;
      case '0': flags |= FLAG_ZERO; break;
      default: done = 1; break;
      }
    } while(!done && (*++fmt) != 0);
    // Read width
    int width = 0;
    if (*fmt == '*') {
      width = va_arg(ap, int);
      if (width < 0) { flags |= FLAG_LEFT; width = -width; }
      fmt++;
    } else {
      while(*fmt >= '0' && *fmt <= '9') { width = width*10 + *fmt - '0'; fmt++; }
    }
    // Read precision
    int precision = -1;
    if (*fmt == '.') {
      precision = 0;
      fmt++;
      if (*fmt == '*') {
	precision = va_arg(ap, int);
	if (precision < 0) precision = -precision;
	fmt++;
      } else {
	while(*fmt >= '0' && *fmt <= '9') { precision = precision*10 + *fmt - '0'; fmt++; }
      }
    }
    // Read size prefix
    int flag_quad = 0, flag_ptrquad = 0;
    int size_prefix = 0, subfmt_processed = 0;
    if (*fmt == 'Q') { flag_quad = 1; fmt++; }
    else if (*fmt == 'P') { flag_ptrquad = 1; fmt++; }
    else {
      int pl = 0;
      if (*fmt == 'h' || *fmt == 'l' || *fmt == 'j' || *fmt == 'z' || *fmt == 't' || *fmt == 'L') { size_prefix = *fmt; pl = 1; }
      // "hh" and "ll" are encoded as first char + 256 * second char.
      if ((*fmt == 'h' && *(fmt+1) == 'h') || (*fmt == 'l' && *(fmt+1) == 'l')) { size_prefix = *fmt + 256 * *(fmt+1); pl = 2; }
      fmt += pl;
    }
    // Call type-specific function
    va_list
{
  va_list ap;
  va_start(ap, fmt);
  int ret = Sleef_vsnprintf(str, size, fmt, ap);
  va_end(ap);
  return ret;
}

// glibc printf customization: register the Q / P modifiers so the host
// printf can format Sleef_quad values directly (register_printf_specifier
// and friends require glibc > 2.13).
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 13)
// NOTE(review): extraction damage — this #include lost its operand
// (presumably <printf.h>, which declares struct printf_info and the
// register_printf_* functions — TODO confirm against upstream).
#include

static int pa_quad = -1, printf_Qmodifier = -1, printf_Pmodifier = -1;

// va_arg fetcher for the registered quad argument type.
static void Sleef_quad_va(void *ptr, va_list *ap) {
  *(Sleef_quad *)ptr = va_arg(*ap, Sleef_quad);
}

// Tells glibc the argument type for conversions carrying our modifiers.
static int printf_arginfo(const struct printf_info *info, size_t n, int *argtypes, int *s) {
  if (info->user & printf_Qmodifier) {
    argtypes[0] = pa_quad;
    return 1;
  } else if (info->user & printf_Pmodifier) {
    argtypes[0] = PA_FLAG_PTR | pa_quad;
    return 1;
  }
  return -1; // not ours; let glibc handle it
}

// Formats a quad conversion on behalf of glibc printf.  Returns -2 for
// conversions without our modifiers so glibc falls back to its own handling.
static int printf_output(FILE *fp, const struct printf_info *info, const void *const *args) {
  if (!(info->user & printf_Qmodifier || info->user & printf_Pmodifier)) return -2;
  // Translate glibc's printf_info flags into our FLAG_* bits.
  int flags = 0;
  if (info->showsign) flags |= FLAG_SIGN;
  if (info->space) flags |= FLAG_BLANK;
  if (info->alt) flags |= FLAG_ALT;
  if (info->left) flags |= FLAG_LEFT;
  if (isupper(info->spec)) flags |= FLAG_UPPER;
  vargquad q = **(const vargquad **)args[0];
  char *xbuf = (char *)malloc(XBUFSIZE+10);
  int len = 0;
  if (tolower(info->spec) == 'a') {
    len = snprintquadhex(xbuf, XBUFSIZE, q, info->width, info->prec, flags);
  } else {
    len = snprintquad(xbuf, XBUFSIZE, q, tolower(info->spec), info->width, info->prec, flags);
  }
  size_t wlen = fwrite(xbuf, len, 1, fp);
  free(xbuf);
  return (int)wlen;
}

// Registers the Q/P modifiers, the quad argument type, and handlers for all
// a/e/f/g conversions.  Returns 0 on success, a negative step code on failure.
EXPORT int Sleef_registerPrintfHook() {
  printf_Qmodifier = register_printf_modifier(L"Q");
  printf_Pmodifier = register_printf_modifier(L"P");
  if (printf_Qmodifier == -1 || printf_Pmodifier == -1) return -1;
  pa_quad = register_printf_type(Sleef_quad_va);
  if (pa_quad == -1) return -2;
  if (register_printf_specifier('a', printf_output, printf_arginfo)) return -3;
  if (register_printf_specifier('e', printf_output, printf_arginfo)) return -4;
  if (register_printf_specifier('f', printf_output, printf_arginfo)) return -5;
  if (register_printf_specifier('g', printf_output, printf_arginfo)) return -6;
  if
    (register_printf_specifier('A', printf_output, printf_arginfo)) return -7;
  if (register_printf_specifier('E', printf_output, printf_arginfo)) return -8;
  if (register_printf_specifier('F', printf_output, printf_arginfo)) return -9;
  if (register_printf_specifier('G', printf_output, printf_arginfo)) return -10;
  return 0;
}

// Removes all handlers installed by Sleef_registerPrintfHook.
EXPORT void Sleef_unregisterPrintfHook() {
  register_printf_specifier('a', NULL, NULL);
  register_printf_specifier('e', NULL, NULL);
  register_printf_specifier('f', NULL, NULL);
  register_printf_specifier('g', NULL, NULL);
  register_printf_specifier('A', NULL, NULL);
  register_printf_specifier('E', NULL, NULL);
  register_printf_specifier('F', NULL, NULL);
  register_printf_specifier('G', NULL, NULL);
}
#endif // #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 13)
#endif // #ifdef ENABLE_PUREC_SCALAR

// Functions for debugging ------------------------------------------------------------------------------------------------------------

#ifdef ENABLE_MAIN
// gcc -DENABLE_MAIN -DENABLEFLOAT128 -Wno-attributes -I../libm -I../quad-tester -I../common -I../arch -DUSEMPFR -DENABLE_AVX2 -mavx2 -mfma sleefsimdqp.c rempitabqp.c ../common/common.c ../quad-tester/qtesterutil.c -lm -lmpfr
// gcc-10.2.0 -DENABLE_MAIN -Wno-attributes -I../libm -I../quad-tester -I../common -I../arch -DUSEMPFR -DENABLE_SVE -march=armv8-a+sve sleefsimdqp.c rempitabqp.c ../common/common.c ../quad-tester/qtesterutil.c -lm -lmpfr
// NOTE(review): extraction damage — the eight #include directives below lost
// their <header> operands; only the quoted include survived.
#include
#include
#include
#include
#include
#include
#include
#include
#include "qtesterutil.h"

// Minimal variant: parse one argument and dump the unpacked representation.
#if 0
int main(int argc, char **argv) {
  Sleef_quad q0 = atof(argv[1]);
  int lane = 0;
  vargquad a0;
  memset(&a0, 0, sizeof(vargquad));
  a0 = xsetq(a0, lane, q0);
  tdx t = cast_tdx_vq(cast_vq_aq(a0));
  printvdouble("t.d3.x", t.d3.x);
}
#endif

// Interactive tester: parses up to three arguments with MPFR as the
// reference, runs the #if-1-selected operation on a random lane, and prints
// the correct vs. computed values side by side.
#if 1
int main(int argc, char **argv) {
  xsrand(time(NULL) + (int)getpid());
  int lane = xrand() % VECTLENDP;
  printf("lane = %d\n", lane);
  char s[200];
  double ad[32];
  mpfr_set_default_prec(18000);
  mpfr_t fr0, fr1, fr2, fr3,
    // (continuation of the debug main() started above)
    mpfr_inits(fr0, fr1, fr2, fr3, NULL);
  // arg0 -> fr0 / q0 / a0: parse, round-trip through float128, inject into lane.
  mpfr_set_d(fr0, 0, GMP_RNDN);
  if (argc >= 2) mpfr_set_str(fr0, argv[1], 10, GMP_RNDN);
  Sleef_quad q0 = mpfr_get_f128(fr0, GMP_RNDN);
  mpfr_set_f128(fr0, q0, GMP_RNDN);
  if (argc >= 2) printf("arg0 : %s\n", sprintfr(fr0));
  vargquad a0;
#if 0
  memrand(&a0, sizeof(vargquad));
#elif 0
  memset(&a0, 0, sizeof(vargquad));
#endif
  a0 = xsetq(a0, lane, q0);
  // arg1 -> fr1 / q1 / a1.
  mpfr_set_d(fr1, 0, GMP_RNDN);
  if (argc >= 3) mpfr_set_str(fr1, argv[2], 10, GMP_RNDN);
  Sleef_quad q1 = mpfr_get_f128(fr1, GMP_RNDN);
  mpfr_set_f128(fr1, q1, GMP_RNDN);
  if (argc >= 3) printf("arg1 : %s\n", sprintfr(fr1));
  vargquad a1;
#if 0
  memrand(&a1, sizeof(vargquad));
#elif 0
  memset(&a1, 0, sizeof(vargquad));
#endif
  a1 = xsetq(a1, lane, q1);
  // arg2 -> fr2 / q2 / a2.
  mpfr_set_d(fr2, 0, GMP_RNDN);
  if (argc >= 4) mpfr_set_str(fr2, argv[3], 10, GMP_RNDN);
  Sleef_quad q2 = mpfr_get_f128(fr2, GMP_RNDN);
  mpfr_set_f128(fr2, q2, GMP_RNDN);
  if (argc >= 4) printf("arg2 : %s\n", sprintfr(fr2));
  vargquad a2;
#if 0
  memrand(&a2, sizeof(vargquad));
#elif 0
  memset(&a2, 0, sizeof(vargquad));
#endif
  a2 = xsetq(a2, lane, q2);

  //

  // One operation block below is enabled at a time (#if 1); each computes the
  // SLEEF result into a3 and the MPFR reference into fr3, then prints both.
  vargquad a3;
#if 0
  a3 = xaddq_u05(a0, a1);
  mpfr_add(fr3, fr0, fr1, GMP_RNDN);
  printf("\nadd\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xsubq_u05(a0, a1);
  mpfr_sub(fr3, fr0, fr1, GMP_RNDN);
  printf("\nsub\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xmulq_u05(a0, a1);
  mpfr_mul(fr3, fr0, fr1, GMP_RNDN);
  printf("\nmul\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xdivq_u05(a0, a1);
  mpfr_div(fr3, fr0, fr1, GMP_RNDN);
  printf("\ndiv\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xsqrtq_u05(a0);
  mpfr_sqrt(fr3, fr0, GMP_RNDN);
  printf("\nsqrt\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xsinq_u10(a0);
  mpfr_sin(fr3, fr0, GMP_RNDN);
  printf("\nsin\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xfabsq(a0);
  mpfr_abs(fr3, fr0, GMP_RNDN);
  printf("\nfabs\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xcopysignq(a0, a1);
  mpfr_copysign(fr3, fr0, fr1, GMP_RNDN);
  printf("\ncopysign\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xfmaxq(a0, a1);
  mpfr_max(fr3, fr0, fr1, GMP_RNDN);
  printf("\nmax\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xfminq(a0, a1);
  mpfr_min(fr3, fr0, fr1, GMP_RNDN);
  printf("\nmin\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xfdimq_u05(a0, a1);
  mpfr_dim(fr3, fr0, fr1, GMP_RNDN);
  printf("\nfdim\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xsinhq_u10(a0);
  mpfr_sinh(fr3, fr0, GMP_RNDN);
  printf("\nsinh\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xcoshq_u10(a0);
  mpfr_cosh(fr3, fr0, GMP_RNDN);
  printf("\ncosh\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xtanhq_u10(a0);
  mpfr_tanh(fr3, fr0, GMP_RNDN);
  printf("\ntanh\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xatan2q_u10(a0, a1);
  mpfr_atan2(fr3, fr0, fr1, GMP_RNDN);
  printf("\natan2\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xpowq_u10(a0, a1);
  mpfr_pow(fr3, fr0, fr1, GMP_RNDN);
  printf("\npow\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xtruncq(a0);
  mpfr_trunc(fr3, fr0);
  printf("\ntrunc\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xfloorq(a0);
  mpfr_floor(fr3, fr0);
  printf("\nfloor\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xceilq(a0);
  mpfr_ceil(fr3, fr0);
  printf("\nceil\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xfmodq(a0, a1);
  mpfr_fmod(fr3, fr0, fr1, GMP_RNDN);
  printf("\nfmod\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xremainderq(a0, a1);
  mpfr_remainder(fr3, fr0, fr1, GMP_RNDN);
  printf("\nremainder\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xcbrtq_u10(a0);
  mpfr_cbrt(fr3, fr0, GMP_RNDN);
  printf("\ncbrt\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 1
  a3 = xfmaq_u05(a0, a1, a2);
  mpfr_fma(fr3, fr0, fr1, fr2, GMP_RNDN);
  printf("\nfma\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  a3 = xhypotq_u05(a0, a1);
  mpfr_hypot(fr3, fr0, fr1, GMP_RNDN);
  printf("\nhypot\n");
  mpfr_set_f128(fr3, mpfr_get_f128(fr3, GMP_RNDN), GMP_RNDN);
  printf("corr : %s\n", sprintfr(fr3));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
#if 0
  vint vi = xilogbq(a0);
  printf("\nilogb\n");
  printf("corr : %d\n", ilogb((double)q0));
  printf("test : %d\n", vi);
#endif
#if 0
  a3 = xldexpq(a0, vcast_vi_i(atoi(argv[2])));
  printf("\nldexp\n");
  printf("corr : %.20g\n", ldexp((double)q0, atoi(argv[2])));
  mpfr_set_f128(fr3, xgetq(a3, lane), GMP_RNDN);
  printf("test : %s\n", sprintfr(fr3));
#endif
  mpfr_clears(fr0, fr1, fr2,
	      fr3, NULL);
}
#endif
#endif