#include #include #include #include /** * From * * g[ n] = 0.5 (w[ n] + conj w[-n]) * g[-n] = 0.5 (w[-n] + conj w[ n]) = conj g[n] * h[ n] = -0.5i (w[ n] - conj w[-n]) * h[-n] = -0.5i (w[-n] - conj w[ n]) = conj h[n] * * ...and... * * w[ n] = g[ n] + h[ n] * conj CEXP_nPI_OVER_N * w[-n] = g[-n] + h[-n] * conj CEXP_nPI_OVER_N * * ...we have: * w[-n] = g[-n] + h[-n] * conj CEXP_nPI_OVER_N * = conj g[ n] + conj h[ n] * conj CEXP_nPI_OVER_N * * Therefore * conj w[-n] = g[n] + h[n] * CEXP_nPI_OVER_N * = g[n] - h[n] * conj CEXP_nPI_OVER_N * w[ n] = g[n] + h[n] * conj CEXP_nPI_OVER_N * * From there: * g[n] = 0.5 (w[n] + conj w[-n]) * h[n] * conj CEXP_nPI_OVER_N = 0.5 (w[n] - conj w[-n]) * h[n] = 0.5 (w[n] - conj w[-n]) * CEXP_nPI_OVER_N */ void nnp_ifft8_real__ref( const float f[restrict static 8], size_t f_stride, float t[restrict static 8], size_t t_stride) { /* Load inputs and scale */ const float W0_real = 0.5f * f[0 * f_stride]; const float W4_real = 0.5f * f[1 * f_stride]; const float W1_real = 0.5f * f[2 * f_stride]; const float W1_imag = 0.5f * f[3 * f_stride]; const float W2_real = f[4 * f_stride]; const float W2_imag = f[5 * f_stride]; const float W3_real = 0.5f * f[6 * f_stride]; const float W3_imag = 0.5f * f[7 * f_stride]; float _Complex w0 = CMPLXF(W0_real + W4_real, W0_real - W4_real); float _Complex w2 = CMPLXF(W2_real, -W2_imag); const float G1_real = W1_real + W3_real; const float G1_imag = W1_imag - W3_imag; const float H1_real = W1_real - W3_real; const float H1_imag = W1_imag + W3_imag; const float H1_plus = H1_real + H1_imag; const float H1_minus = H1_real - H1_imag; const float w1_real = G1_real - SQRT2_OVER_2 * H1_plus; const float w1_imag = G1_imag + SQRT2_OVER_2 * H1_minus; const float w3_real = G1_real + SQRT2_OVER_2 * H1_plus; const float w3_imag = -G1_imag + SQRT2_OVER_2 * H1_minus; float _Complex w1 = CMPLXF(w1_real, w1_imag); float _Complex w3 = CMPLXF(w3_real, w3_imag); ifft4fc(&w0, &w1, &w2, &w3); /* Store outputs */ t[0 * t_stride] = crealf(w0); t[1 * t_stride] = cimagf(w0); t[2 * t_stride] = crealf(w1); t[3 * t_stride] = cimagf(w1); t[4 * t_stride] = crealf(w2); t[5 * t_stride] = cimagf(w2); t[6 * t_stride] = crealf(w3); t[7 * t_stride] = cimagf(w3); } void nnp_ifft16_real__ref( const float f[restrict static 16], size_t f_stride, float t[restrict static 16], size_t t_stride) { /* Load inputs and scale */ const float W0_real = 0.5f * f[ 0 * f_stride]; const float W8_real = 0.5f * f[ 1 * f_stride]; const float W1_real = 0.5f * f[ 2 * f_stride]; const float W1_imag = 0.5f * f[ 3 * f_stride]; const float W2_real = 0.5f * f[ 4 * f_stride]; const float W2_imag = 0.5f * f[ 5 * f_stride]; const float W3_real = 0.5f * f[ 6 * f_stride]; const float W3_imag = 0.5f * f[ 7 * f_stride]; const float W4_real = f[ 8 * f_stride]; const float W4_imag = f[ 9 * f_stride]; const float W5_real = 0.5f * f[10 * f_stride]; const float W5_imag = 0.5f * f[11 * f_stride]; const float W6_real = 0.5f * f[12 * f_stride]; const float W6_imag = 0.5f * f[13 * f_stride]; const float W7_real = 0.5f * f[14 * f_stride]; const float W7_imag = 0.5f * f[15 * f_stride]; float _Complex w0 = CMPLXF(W0_real + W8_real, W0_real - W8_real); float _Complex w4 = CMPLXF(W4_real, -W4_imag); const float G2_real = W2_real + W6_real; const float G2_imag = W2_imag - W6_imag; const float H2_real = W2_real - W6_real; const float H2_imag = W2_imag + W6_imag; const float H2_plus = H2_real + H2_imag; const float H2_minus = H2_real - H2_imag; const float w2_real = G2_real - SQRT2_OVER_2 * H2_plus; const float w2_imag = G2_imag + SQRT2_OVER_2 * H2_minus; const float w6_real = G2_real + SQRT2_OVER_2 * H2_plus; const float w6_imag = -G2_imag + SQRT2_OVER_2 * H2_minus; const float G1_real = W1_real + W7_real; const float G1_imag = W1_imag - W7_imag; const float G3_real = W3_real + W5_real; const float G3_imag = W3_imag - W5_imag; const float H1_real = W1_real - W7_real; const float H1_imag = W1_imag + W7_imag; const float H3_real = W3_real - W5_real; const float H3_imag = W3_imag + W5_imag; const float w1_real = G1_real - H1_imag * COS_1PI_OVER_8 - H1_real * COS_3PI_OVER_8; const float w1_imag = G1_imag + H1_real * COS_1PI_OVER_8 - H1_imag * COS_3PI_OVER_8; const float w7_real = G1_real + H1_imag * COS_1PI_OVER_8 + H1_real * COS_3PI_OVER_8; const float w7_imag = -G1_imag + H1_real * COS_1PI_OVER_8 - H1_imag * COS_3PI_OVER_8; const float w3_real = G3_real - H3_imag * COS_3PI_OVER_8 - H3_real * COS_1PI_OVER_8; const float w3_imag = G3_imag + H3_real * COS_3PI_OVER_8 - H3_imag * COS_1PI_OVER_8; const float w5_real = G3_real + H3_imag * COS_3PI_OVER_8 + H3_real * COS_1PI_OVER_8; const float w5_imag = -G3_imag + H3_real * COS_3PI_OVER_8 - H3_imag * COS_1PI_OVER_8; float _Complex w1 = CMPLXF(w1_real, w1_imag); float _Complex w7 = CMPLXF(w7_real, w7_imag); float _Complex w2 = CMPLXF(w2_real, w2_imag); float _Complex w6 = CMPLXF(w6_real, w6_imag); float _Complex w3 = CMPLXF(w3_real, w3_imag); float _Complex w5 = CMPLXF(w5_real, w5_imag); ifft8fc(&w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7); /* Store outputs */ t[ 0 * t_stride] = crealf(w0); t[ 1 * t_stride] = cimagf(w0); t[ 2 * t_stride] = crealf(w1); t[ 3 * t_stride] = cimagf(w1); t[ 4 * t_stride] = crealf(w2); t[ 5 * t_stride] = cimagf(w2); t[ 6 * t_stride] = crealf(w3); t[ 7 * t_stride] = cimagf(w3); t[ 8 * t_stride] = crealf(w4); t[ 9 * t_stride] = cimagf(w4); t[10 * t_stride] = crealf(w5); t[11 * t_stride] = cimagf(w5); t[12 * t_stride] = crealf(w6); t[13 * t_stride] = cimagf(w6); t[14 * t_stride] = crealf(w7); t[15 * t_stride] = cimagf(w7); } void nnp_ifft32_real__ref( const float f[restrict static 16], size_t f_stride, float t[restrict static 16], size_t t_stride) { /* Load inputs and scale */ const float W0_real = 0.5f * f[ 0 * f_stride]; const float W16_real = 0.5f * f[ 1 * f_stride]; const float W1_real = 0.5f * f[ 2 * f_stride]; const float W1_imag = 0.5f * f[ 3 * f_stride]; const float W2_real = 0.5f * f[ 4 * f_stride]; const float W2_imag = 0.5f * f[ 5 * f_stride]; const float W3_real = 0.5f * f[ 6 * f_stride]; const float W3_imag = 0.5f * f[ 7 * f_stride]; const float W4_real = 0.5f * f[ 8 * f_stride]; const float W4_imag = 0.5f * f[ 9 * f_stride]; const float W5_real = 0.5f * f[10 * f_stride]; const float W5_imag = 0.5f * f[11 * f_stride]; const float W6_real = 0.5f * f[12 * f_stride]; const float W6_imag = 0.5f * f[13 * f_stride]; const float W7_real = 0.5f * f[14 * f_stride]; const float W7_imag = 0.5f * f[15 * f_stride]; const float W8_real = f[16 * f_stride]; const float W8_imag = f[17 * f_stride]; const float W9_real = 0.5f * f[18 * f_stride]; const float W9_imag = 0.5f * f[19 * f_stride]; const float W10_real = 0.5f * f[20 * f_stride]; const float W10_imag = 0.5f * f[21 * f_stride]; const float W11_real = 0.5f * f[22 * f_stride]; const float W11_imag = 0.5f * f[23 * f_stride]; const float W12_real = 0.5f * f[24 * f_stride]; const float W12_imag = 0.5f * f[25 * f_stride]; const float W13_real = 0.5f * f[26 * f_stride]; const float W13_imag = 0.5f * f[27 * f_stride]; const float W14_real = 0.5f * f[28 * f_stride]; const float W14_imag = 0.5f * f[29 * f_stride]; const float W15_real = 0.5f * f[30 * f_stride]; const float W15_imag = 0.5f * f[31 * f_stride]; float _Complex w0 = CMPLXF(W0_real + W16_real, W0_real - W16_real); float _Complex w8 = CMPLXF(W8_real, -W8_imag); const float G4_real = W4_real + W12_real; const float G4_imag = W4_imag - W12_imag; const float H4_real = W4_real - W12_real; const float H4_imag = W4_imag + W12_imag; const float H4_plus = H4_real + H4_imag; const float H4_minus = H4_real - H4_imag; const float w4_real = G4_real - SQRT2_OVER_2 * H4_plus; const float w4_imag = G4_imag + SQRT2_OVER_2 * H4_minus; const float w12_real = G4_real + SQRT2_OVER_2 * H4_plus; const float w12_imag = -G4_imag + SQRT2_OVER_2 * H4_minus; float _Complex w4 = CMPLXF( w4_real, w4_imag); float _Complex w12 = CMPLXF(w12_real, w12_imag); const float G1_real = W1_real + W15_real; const float G1_imag = W1_imag - W15_imag; const float G7_real = W7_real + W9_real; const float G7_imag = W7_imag - W9_imag; const float H1_real = W1_real - W15_real; const float H1_imag = W1_imag + W15_imag; const float H7_real = W7_real - W9_real; const float H7_imag = W7_imag + W9_imag; const float w1_real = G1_real - H1_imag * COS__1PI_OVER_16 - H1_real * COS__7PI_OVER_16; const float w1_imag = G1_imag + H1_real * COS__1PI_OVER_16 - H1_imag * COS__7PI_OVER_16; const float w15_real = G1_real + H1_imag * COS__1PI_OVER_16 + H1_real * COS__7PI_OVER_16; const float w15_imag = -G1_imag + H1_real * COS__1PI_OVER_16 - H1_imag * COS__7PI_OVER_16; float _Complex w1 = CMPLXF( w1_real, w1_imag); float _Complex w15 = CMPLXF(w15_real, w15_imag); const float w7_real = G7_real - H7_imag * COS__7PI_OVER_16 - H7_real * COS__1PI_OVER_16; const float w7_imag = G7_imag + H7_real * COS__7PI_OVER_16 - H7_imag * COS__1PI_OVER_16; const float w9_real = G7_real + H7_imag * COS__7PI_OVER_16 + H7_real * COS__1PI_OVER_16; const float w9_imag = -G7_imag + H7_real * COS__7PI_OVER_16 - H7_imag * COS__1PI_OVER_16; float _Complex w7 = CMPLXF(w7_real, w7_imag); float _Complex w9 = CMPLXF(w9_real, w9_imag); const float G2_real = W2_real + W14_real; const float G2_imag = W2_imag - W14_imag; const float G6_real = W6_real + W10_real; const float G6_imag = W6_imag - W10_imag; const float H2_real = W2_real - W14_real; const float H2_imag = W2_imag + W14_imag; const float H6_real = W6_real - W10_real; const float H6_imag = W6_imag + W10_imag; const float w2_real = G2_real - H2_imag * COS__2PI_OVER_16 - H2_real * COS__6PI_OVER_16; const float w2_imag = G2_imag + H2_real * COS__2PI_OVER_16 - H2_imag * COS__6PI_OVER_16; const float w14_real = G2_real + H2_imag * COS__2PI_OVER_16 + H2_real * COS__6PI_OVER_16; const float w14_imag = -G2_imag + H2_real * COS__2PI_OVER_16 - H2_imag * COS__6PI_OVER_16; float _Complex w2 = CMPLXF( w2_real, w2_imag); float _Complex w14 = CMPLXF(w14_real, w14_imag); const float w6_real = G6_real - H6_imag * COS__6PI_OVER_16 - H6_real * COS__2PI_OVER_16; const float w6_imag = G6_imag + H6_real * COS__6PI_OVER_16 - H6_imag * COS__2PI_OVER_16; const float w10_real = G6_real + H6_imag * COS__6PI_OVER_16 + H6_real * COS__2PI_OVER_16; const float w10_imag = -G6_imag + H6_real * COS__6PI_OVER_16 - H6_imag * COS__2PI_OVER_16; float _Complex w6 = CMPLXF( w6_real, w6_imag); float _Complex w10 = CMPLXF(w10_real, w10_imag); const float G3_real = W3_real + W13_real; const float G3_imag = W3_imag - W13_imag; const float G5_real = W5_real + W11_real; const float G5_imag = W5_imag - W11_imag; const float H3_real = W3_real - W13_real; const float H3_imag = W3_imag + W13_imag; const float H5_real = W5_real - W11_real; const float H5_imag = W5_imag + W11_imag; const float w3_real = G3_real - H3_imag * COS__3PI_OVER_16 - H3_real * COS__5PI_OVER_16; const float w3_imag = G3_imag + H3_real * COS__3PI_OVER_16 - H3_imag * COS__5PI_OVER_16; const float w13_real = G3_real + H3_imag * COS__3PI_OVER_16 + H3_real * COS__5PI_OVER_16; const float w13_imag = -G3_imag + H3_real * COS__3PI_OVER_16 - H3_imag * COS__5PI_OVER_16; float _Complex w3 = CMPLXF( w3_real, w3_imag); float _Complex w13 = CMPLXF(w13_real, w13_imag); const float w5_real = G5_real - H5_imag * COS__5PI_OVER_16 - H5_real * COS__3PI_OVER_16; const float w5_imag = G5_imag + H5_real * COS__5PI_OVER_16 - H5_imag * COS__3PI_OVER_16; const float w11_real = G5_real + H5_imag * COS__5PI_OVER_16 + H5_real * COS__3PI_OVER_16; const float w11_imag = -G5_imag + H5_real * COS__5PI_OVER_16 - H5_imag * COS__3PI_OVER_16; float _Complex w5 = CMPLXF( w5_real, w5_imag); float _Complex w11 = CMPLXF(w11_real, w11_imag); ifft16fc(&w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, &w8, &w9, &w10, &w11, &w12, &w13, &w14, &w15); /* Store outputs */ t[ 0 * t_stride] = crealf(w0); t[ 1 * t_stride] = cimagf(w0); t[ 2 * t_stride] = crealf(w1); t[ 3 * t_stride] = cimagf(w1); t[ 4 * t_stride] = crealf(w2); t[ 5 * t_stride] = cimagf(w2); t[ 6 * t_stride] = crealf(w3); t[ 7 * t_stride] = cimagf(w3); t[ 8 * t_stride] = crealf(w4); t[ 9 * t_stride] = cimagf(w4); t[10 * t_stride] = crealf(w5); t[11 * t_stride] = cimagf(w5); t[12 * t_stride] = crealf(w6); t[13 * t_stride] = cimagf(w6); t[14 * t_stride] = crealf(w7); t[15 * t_stride] = cimagf(w7); t[16 * t_stride] = crealf(w8); t[17 * t_stride] = cimagf(w8); t[18 * t_stride] = crealf(w9); t[19 * t_stride] = cimagf(w9); t[20 * t_stride] = crealf(w10); t[21 * t_stride] = cimagf(w10); t[22 * t_stride] = crealf(w11); t[23 * t_stride] = cimagf(w11); t[24 * t_stride] = crealf(w12); t[25 * t_stride] = cimagf(w12); t[26 * t_stride] = crealf(w13); t[27 * t_stride] = cimagf(w13); t[28 * t_stride] = crealf(w14); t[29 * t_stride] = cimagf(w14); t[30 * t_stride] = crealf(w15); t[31 * t_stride] = cimagf(w15); }