#include #include #include #include void nnp_shdotxf1__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0 = psimd_zero_f32(); const uint16_t *restrict y0 = y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 4; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); } sum[0] = acc0; } void nnp_shdotxf2__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0, vacc1; vacc0 = vacc1 = psimd_zero_f32(); const uint16_t *restrict y0 = y; const uint16_t *restrict y1 = y0 + stride_y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 8; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; const psimd_f32x2 vy1 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y1)); y1 += 8; vacc1 += vx_lo * vy1.lo; vacc1 += vx_hi * vy1.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); float acc1 = psimd_reduce_sum_f32(vacc1); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); acc1 += sx * fp16_alt_to_fp32_value(*y1++); } sum[0] = acc0; sum[1] = acc1; } void nnp_shdotxf3__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0, vacc1, vacc2; vacc0 = vacc1 = vacc2 = psimd_zero_f32(); const uint16_t *restrict y0 = y; const uint16_t *restrict y1 = y0 + stride_y; const uint16_t *restrict y2 = y1 + stride_y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 8; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; const psimd_f32x2 vy1 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y1)); y1 += 8; vacc1 += vx_lo * vy1.lo; vacc1 += vx_hi * vy1.hi; const psimd_f32x2 vy2 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y2)); y2 += 8; vacc2 += vx_lo * vy2.lo; vacc2 += vx_hi * vy2.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); float acc1 = psimd_reduce_sum_f32(vacc1); float acc2 = psimd_reduce_sum_f32(vacc2); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); acc1 += sx * fp16_alt_to_fp32_value(*y1++); acc2 += sx * fp16_alt_to_fp32_value(*y2++); } sum[0] = acc0; sum[1] = acc1; sum[2] = acc2; } void nnp_shdotxf4__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0, vacc1, vacc2, vacc3; vacc0 = vacc1 = vacc2 = vacc3 = psimd_zero_f32(); const uint16_t *restrict y0 = y; const uint16_t *restrict y1 = y0 + stride_y; const uint16_t *restrict y2 = y1 + stride_y; const uint16_t *restrict y3 = y2 + stride_y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 8; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; const psimd_f32x2 vy1 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y1)); y1 += 8; vacc1 += vx_lo * vy1.lo; vacc1 += vx_hi * vy1.hi; const psimd_f32x2 vy2 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y2)); y2 += 8; vacc2 += vx_lo * vy2.lo; vacc2 += vx_hi * vy2.hi; const psimd_f32x2 vy3 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y3)); y3 += 8; vacc3 += vx_lo * vy3.lo; vacc3 += vx_hi * vy3.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); float acc1 = psimd_reduce_sum_f32(vacc1); float acc2 = psimd_reduce_sum_f32(vacc2); float acc3 = psimd_reduce_sum_f32(vacc3); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); acc1 += sx * fp16_alt_to_fp32_value(*y1++); acc2 += sx * fp16_alt_to_fp32_value(*y2++); acc3 += sx * fp16_alt_to_fp32_value(*y3++); } sum[0] = acc0; sum[1] = acc1; sum[2] = acc2; sum[3] = acc3; } void nnp_shdotxf5__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0, vacc1, vacc2, vacc3, vacc4; vacc0 = vacc1 = vacc2 = vacc3 = vacc4 = psimd_zero_f32(); const uint16_t *restrict y0 = y; const uint16_t *restrict y1 = y0 + stride_y; const uint16_t *restrict y2 = y1 + stride_y; const uint16_t *restrict y3 = y2 + stride_y; const uint16_t *restrict y4 = y3 + stride_y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 8; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; const psimd_f32x2 vy1 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y1)); y1 += 8; vacc1 += vx_lo * vy1.lo; vacc1 += vx_hi * vy1.hi; const psimd_f32x2 vy2 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y2)); y2 += 8; vacc2 += vx_lo * vy2.lo; vacc2 += vx_hi * vy2.hi; const psimd_f32x2 vy3 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y3)); y3 += 8; vacc3 += vx_lo * vy3.lo; vacc3 += vx_hi * vy3.hi; const psimd_f32x2 vy4 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y4)); y4 += 8; vacc4 += vx_lo * vy4.lo; vacc4 += vx_hi * vy4.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); float acc1 = psimd_reduce_sum_f32(vacc1); float acc2 = psimd_reduce_sum_f32(vacc2); float acc3 = psimd_reduce_sum_f32(vacc3); float acc4 = psimd_reduce_sum_f32(vacc4); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); acc1 += sx * fp16_alt_to_fp32_value(*y1++); acc2 += sx * fp16_alt_to_fp32_value(*y2++); acc3 += sx * fp16_alt_to_fp32_value(*y3++); acc4 += sx * fp16_alt_to_fp32_value(*y4++); } sum[0] = acc0; sum[1] = acc1; sum[2] = acc2; sum[3] = acc3; sum[4] = acc4; } void nnp_shdotxf6__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0, vacc1, vacc2, vacc3, vacc4, vacc5; vacc0 = vacc1 = vacc2 = vacc3 = vacc4 = vacc5 = psimd_zero_f32(); const uint16_t *restrict y0 = y; const uint16_t *restrict y1 = y0 + stride_y; const uint16_t *restrict y2 = y1 + stride_y; const uint16_t *restrict y3 = y2 + stride_y; const uint16_t *restrict y4 = y3 + stride_y; const uint16_t *restrict y5 = y4 + stride_y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 8; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; const psimd_f32x2 vy1 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y1)); y1 += 8; vacc1 += vx_lo * vy1.lo; vacc1 += vx_hi * vy1.hi; const psimd_f32x2 vy2 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y2)); y2 += 8; vacc2 += vx_lo * vy2.lo; vacc2 += vx_hi * vy2.hi; const psimd_f32x2 vy3 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y3)); y3 += 8; vacc3 += vx_lo * vy3.lo; vacc3 += vx_hi * vy3.hi; const psimd_f32x2 vy4 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y4)); y4 += 8; vacc4 += vx_lo * vy4.lo; vacc4 += vx_hi * vy4.hi; const psimd_f32x2 vy5 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y5)); y5 += 8; vacc5 += vx_lo * vy5.lo; vacc5 += vx_hi * vy5.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); float acc1 = psimd_reduce_sum_f32(vacc1); float acc2 = psimd_reduce_sum_f32(vacc2); float acc3 = psimd_reduce_sum_f32(vacc3); float acc4 = psimd_reduce_sum_f32(vacc4); float acc5 = psimd_reduce_sum_f32(vacc5); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); acc1 += sx * fp16_alt_to_fp32_value(*y1++); acc2 += sx * fp16_alt_to_fp32_value(*y2++); acc3 += sx * fp16_alt_to_fp32_value(*y3++); acc4 += sx * fp16_alt_to_fp32_value(*y4++); acc5 += sx * fp16_alt_to_fp32_value(*y5++); } sum[0] = acc0; sum[1] = acc1; sum[2] = acc2; sum[3] = acc3; sum[4] = acc4; sum[5] = acc5; } void nnp_shdotxf7__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0, vacc1, vacc2, vacc3, vacc4, vacc5, vacc6; vacc0 = vacc1 = vacc2 = vacc3 = vacc4 = vacc5 = vacc6 = psimd_zero_f32(); const uint16_t *restrict y0 = y; const uint16_t *restrict y1 = y0 + stride_y; const uint16_t *restrict y2 = y1 + stride_y; const uint16_t *restrict y3 = y2 + stride_y; const uint16_t *restrict y4 = y3 + stride_y; const uint16_t *restrict y5 = y4 + stride_y; const uint16_t *restrict y6 = y5 + stride_y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 8; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; const psimd_f32x2 vy1 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y1)); y1 += 8; vacc1 += vx_lo * vy1.lo; vacc1 += vx_hi * vy1.hi; const psimd_f32x2 vy2 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y2)); y2 += 8; vacc2 += vx_lo * vy2.lo; vacc2 += vx_hi * vy2.hi; const psimd_f32x2 vy3 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y3)); y3 += 8; vacc3 += vx_lo * vy3.lo; vacc3 += vx_hi * vy3.hi; const psimd_f32x2 vy4 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y4)); y4 += 8; vacc4 += vx_lo * vy4.lo; vacc4 += vx_hi * vy4.hi; const psimd_f32x2 vy5 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y5)); y5 += 8; vacc5 += vx_lo * vy5.lo; vacc5 += vx_hi * vy5.hi; const psimd_f32x2 vy6 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y6)); y6 += 8; vacc6 += vx_lo * vy6.lo; vacc6 += vx_hi * vy6.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); float acc1 = psimd_reduce_sum_f32(vacc1); float acc2 = psimd_reduce_sum_f32(vacc2); float acc3 = psimd_reduce_sum_f32(vacc3); float acc4 = psimd_reduce_sum_f32(vacc4); float acc5 = psimd_reduce_sum_f32(vacc5); float acc6 = psimd_reduce_sum_f32(vacc6); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); acc1 += sx * fp16_alt_to_fp32_value(*y1++); acc2 += sx * fp16_alt_to_fp32_value(*y2++); acc3 += sx * fp16_alt_to_fp32_value(*y3++); acc4 += sx * fp16_alt_to_fp32_value(*y4++); acc5 += sx * fp16_alt_to_fp32_value(*y5++); acc6 += sx * fp16_alt_to_fp32_value(*y6++); } sum[0] = acc0; sum[1] = acc1; sum[2] = acc2; sum[3] = acc3; sum[4] = acc4; sum[5] = acc5; sum[6] = acc6; } void nnp_shdotxf8__psimd( const float x[restrict static 1], const uint16_t y[restrict static 1], size_t stride_y, float sum[restrict static 1], size_t n) { psimd_f32 vacc0, vacc1, vacc2, vacc3, vacc4, vacc5, vacc6, vacc7; vacc0 = vacc1 = vacc2 = vacc3 = vacc4 = vacc5 = vacc6 = vacc7 = psimd_zero_f32(); const uint16_t *restrict y0 = y; const uint16_t *restrict y1 = y0 + stride_y; const uint16_t *restrict y2 = y1 + stride_y; const uint16_t *restrict y3 = y2 + stride_y; const uint16_t *restrict y4 = y3 + stride_y; const uint16_t *restrict y5 = y4 + stride_y; const uint16_t *restrict y6 = y5 + stride_y; const uint16_t *restrict y7 = y6 + stride_y; for (; n >= 8; n -= 8) { const psimd_f32 vx_lo = psimd_load_f32(x); const psimd_f32 vx_hi = psimd_load_f32(x + 4); x += 8; const psimd_f32x2 vy0 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y0)); y0 += 8; vacc0 += vx_lo * vy0.lo; vacc0 += vx_hi * vy0.hi; const psimd_f32x2 vy1 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y1)); y1 += 8; vacc1 += vx_lo * vy1.lo; vacc1 += vx_hi * vy1.hi; const psimd_f32x2 vy2 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y2)); y2 += 8; vacc2 += vx_lo * vy2.lo; vacc2 += vx_hi * vy2.hi; const psimd_f32x2 vy3 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y3)); y3 += 8; vacc3 += vx_lo * vy3.lo; vacc3 += vx_hi * vy3.hi; const psimd_f32x2 vy4 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y4)); y4 += 8; vacc4 += vx_lo * vy4.lo; vacc4 += vx_hi * vy4.hi; const psimd_f32x2 vy5 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y5)); y5 += 8; vacc5 += vx_lo * vy5.lo; vacc5 += vx_hi * vy5.hi; const psimd_f32x2 vy6 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y6)); y6 += 8; vacc6 += vx_lo * vy6.lo; vacc6 += vx_hi * vy6.hi; const psimd_f32x2 vy7 = fp16_alt_to_fp32x2_psimd(psimd_load_u16(y7)); y7 += 8; vacc7 += vx_lo * vy7.lo; vacc7 += vx_hi * vy7.hi; } float acc0 = psimd_reduce_sum_f32(vacc0); float acc1 = psimd_reduce_sum_f32(vacc1); float acc2 = psimd_reduce_sum_f32(vacc2); float acc3 = psimd_reduce_sum_f32(vacc3); float acc4 = psimd_reduce_sum_f32(vacc4); float acc5 = psimd_reduce_sum_f32(vacc5); float acc6 = psimd_reduce_sum_f32(vacc6); float acc7 = psimd_reduce_sum_f32(vacc7); while (n--) { const float sx = (*x++); acc0 += sx * fp16_alt_to_fp32_value(*y0++); acc1 += sx * fp16_alt_to_fp32_value(*y1++); acc2 += sx * fp16_alt_to_fp32_value(*y2++); acc3 += sx * fp16_alt_to_fp32_value(*y3++); acc4 += sx * fp16_alt_to_fp32_value(*y4++); acc5 += sx * fp16_alt_to_fp32_value(*y5++); acc6 += sx * fp16_alt_to_fp32_value(*y6++); acc7 += sx * fp16_alt_to_fp32_value(*y7++); } sum[0] = acc0; sum[1] = acc1; sum[2] = acc2; sum[3] = acc3; sum[4] = acc4; sum[5] = acc5; sum[6] = acc6; sum[7] = acc7; }