// Copyright Naoki Shibata and contributors 2010 - 2021. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 1 #if !defined(__AVX__) && !defined(SLEEF_GENHEADER) #error Please specify -mavx. #endif #elif CONFIG == 4 #if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER) #error Please specify -mavx and -mfma4. #endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 2 //@#define LOG2VECTLENDP 2 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #define ACCURATE_SQRT //@#define ACCURATE_SQRT #if !defined(SLEEF_GENHEADER) #if defined(_MSC_VER) #include #else #include #endif #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) typedef __m256i vmask; typedef __m256i vopmask; typedef __m256d vdouble; typedef __m128i vint; typedef __m256 vfloat; typedef struct { __m128i x, y; } vint2; typedef __m256i vint64; typedef __m256i vuint64; typedef struct { vmask x, y; } vquad; typedef vquad vargquad; // #if !defined(SLEEF_GENHEADER) #ifndef __SLEEF_H__ void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx); #endif static INLINE int cpuSupportsAVX() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 28)) != 0; } static INLINE int cpuSupportsFMA4() { int32_t reg[4]; Sleef_x86CpuID(reg, 0x80000001, 0); return (reg[2] & (1 << 16)) != 0; } #if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__) static INLINE int vavailability_i(int name) { int d = cpuSupportsAVX() && cpuSupportsFMA4(); return d ? 3 : 0; } #define ENABLE_FMA_DP #define ENABLE_FMA_SP #define ISANAME "AVX + AMD FMA4" #define DFTPRIORITY 21 #else static INLINE int vavailability_i(int name) { int d = cpuSupportsAVX(); return d ? 3 : 0; } #define ISANAME "AVX" #define DFTPRIORITY 20 #endif #endif // #if !defined(SLEEF_GENHEADER) static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } // static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); } // static vint2 vloadu_vi2_p(int32_t *p) { vint2 r; r.x = _mm_loadu_si128((__m128i *) p ); r.y = _mm_loadu_si128((__m128i *)(p + 4)); return r; } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *) p , v.x); _mm_storeu_si128((__m128i *)(p + 4), v.y); } static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vcast_vo32_vo64(vopmask o) { return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0)))); } static INLINE vopmask vcast_vo64_vo32(vopmask o) { return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ)); } static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); } // static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } static INLINE vmask vcastu_vm_vi(vint vi) { __m256i m = _mm256_castsi128_si256(_mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0))); return _mm256_insertf128_si256(m, _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0)), 1); } static INLINE vint vcastu_vi_vm(vmask vi) { return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(_mm256_castsi256_si128(vi) , 0x0d), _mm_set_epi32( 0, 0, -1, -1)), _mm_and_si128(_mm_shuffle_epi32(_mm256_extractf128_si256(vi, 1), 0xd0), _mm_set_epi32(-1, -1, 0, 0))); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ)); } static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); } static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); } // static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); } static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } #if CONFIG == 1 static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); } #else static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); } #endif static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); } // static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); } static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); } static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); } static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); } static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); } static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); } static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2)); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3))); } static INLINE vopmask visinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask vispinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visminf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visnan_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ)); } static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); } static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); } static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { int a[VECTLENDP]; vstoreu_v_p_vi(a, vi); return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); } #if defined(_MSC_VER) // This function is needed when debugging on MSVC. static INLINE double vcast_d_vd(vdouble v) { double a[VECTLENDP]; vstoreu_v_p_vd(a, v); return a[0]; } #endif // static INLINE vint2 vcast_vi2_vm(vmask vm) { vint2 r; r.x = _mm256_castsi256_si128(vm); r.y = _mm256_extractf128_si256(vm, 1); return r; } static INLINE vmask vcast_vm_vi2(vint2 vi) { vmask m = _mm256_castsi128_si256(vi.x); m = _mm256_insertf128_si256(m, vi.y, 1); return m; } static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); } static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); } static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); } static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); } static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); } static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); } static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); } static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } #if CONFIG == 1 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); } #endif static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); } static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) }; return vi; } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) }; return vi; } static INLINE vint2 vneg_vi2_vi2(vint2 e) { vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) }; return vi; } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) }; return vi; } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) }; return vi; } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) }; return vi; } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) }; return vi; } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); } static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) }; return vi; } static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) }; return vi; } static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) }; return vi; } static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpeq_epi32(x.x, y.x); r.y = _mm_cmpeq_epi32(x.y, y.y); return vcast_vm_vi2(r); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpgt_epi32(x.x, y.x); r.y = _mm_cmpgt_epi32(x.y, y.y); return vcast_vm_vi2(r); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpeq_epi32(x.x, y.x); r.y = _mm_cmpeq_epi32(x.y, y.y); return r; } static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpgt_epi32(x.x, y.x); r.y = _mm_cmpgt_epi32(x.y, y.y); return r; } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { vint2 n = vcast_vi2_vm(m); vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) }; return r; } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz; iz.x = _mm_add_epi64(ix.x, iy.x); iz.y = _mm_add_epi64(ix.y, iy.y); return vcast_vm_vi2(iz); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); } static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } // static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); } static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { int a[VECTLENSP]; vstoreu_v_p_vi2(a, vi2); return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]], ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); } #ifdef _MSC_VER // This function is needed when debugging on MSVC. static INLINE float vcast_f_vf(vfloat v) { float a[VECTLENSP]; vstoreu_v_p_vf(a, v); return a[0]; } #endif // #define PNMASK _mm256_set_pd( -0.0, +0.0, -0.0, +0.0 ) #define NPMASK _mm256_set_pd( +0.0, -0.0, +0.0, -0.0 ) #define PNMASKf _mm256_set_ps( -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f ) #define NPMASKf _mm256_set_ps( +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f ) static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); } static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); } static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); } static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); } #if CONFIG == 1 static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #else static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); } #endif static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); } static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); } static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); } static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0)); _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1)); } static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0)); _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1)); } // static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); } static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); } static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); } static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0)))); _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0)))); _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1)))); _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1)))); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } // static vquad loadu_vq_p(void *p) { vquad vq; memcpy(&vq, p, VECTLENDP * 16); return vq; } static INLINE vquad cast_vq_aq(vargquad aq) { vquad vq; memcpy(&vq, &aq, VECTLENDP * 16); return vq; } static INLINE vargquad cast_aq_vq(vquad vq) { vargquad aq; memcpy(&aq, &vq, VECTLENDP * 16); return aq; } static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0; } static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o))); } static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0); __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0); vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl)); return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1); } static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); } static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0); __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0); vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl)); return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1); } #define vsll64_vm_vm_i(x, c) \ _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \ _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1) #define vsrl64_vm_vm_i(x, c) \ _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \ _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1) //@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1) //@#define vsrl64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1) static INLINE vmask vcast_vm_vi(vint vi) { vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1)); vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1)); vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1); return vor_vm_vm_vm(vcastu_vm_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1))), m); } static INLINE vint vcast_vi_vm(vmask vm) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)), _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); } static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; } static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; } static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; } static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }