#pragma once #include static inline void neon_transpose4x4_inplace_f32( float32x4_t row0[static restrict 1], float32x4_t row1[static restrict 1], float32x4_t row2[static restrict 1], float32x4_t row3[static restrict 1]) { /* * row0 = ( x00 x01 x02 x03 ) * row1 = ( x10 x11 x12 x13 ) * row2 = ( x20 x21 x22 x23 ) * row3 = ( x30 x31 x32 x33 ) */ /* * row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 ) * row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 ) */ float32x4x2_t row01 = vtrnq_f32(*row0, *row1); float32x4x2_t row23 = vtrnq_f32(*row2, *row3); /* * row0 = ( x00 x10 x20 x30 ) * row1 = ( x01 x11 x21 x31 ) * row2 = ( x02 x12 x22 x32 ) * row3 = ( x03 x13 x23 x33 ) */ *row0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0])); *row1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1])); *row2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0])); *row3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1])); }