#include #include #include #include #include #include #define BLOCK_SIZE 8 #define KERNEL_SIZE 3 #define OUTPUT_SIZE (BLOCK_SIZE - KERNEL_SIZE + 1) void nnp_iwt8x8_3x3_with_offset__scalar( const float data[restrict static 1], float transform[restrict static 1], size_t data_stride, size_t transform_stride, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset) { transform_stride /= sizeof(float); float block[BLOCK_SIZE][BLOCK_SIZE]; if (row_offset != 0) { memset(&block[0][0], 0, row_offset * BLOCK_SIZE * sizeof(float)); } const uint32_t row_end = row_offset + row_count; if (row_end != BLOCK_SIZE) { memset(&block[row_end][0], 0, (BLOCK_SIZE - row_end) * BLOCK_SIZE * sizeof(float)); } for (uint32_t row = row_offset; row < row_end; row++) { float d0, d1, d2, d3, d4, d5, d6, d7; d0 = d1 = d2 = d3 = d4 = d5 = d6 = d7 = 0.0f; const float *restrict row_data = data; uint32_t remaining_column_count = column_count; switch (column_offset) { case 0: d0 = *row_data++; if (--remaining_column_count == 0) { break; } case 1: d1 = *row_data++; if (--remaining_column_count == 0) { break; } case 2: d2 = *row_data++; if (--remaining_column_count == 0) { break; } case 3: d3 = *row_data++; if (--remaining_column_count == 0) { break; } case 4: d4 = *row_data++; if (--remaining_column_count == 0) { break; } case 5: d5 = *row_data++; if (--remaining_column_count == 0) { break; } case 6: d6 = *row_data++; if (--remaining_column_count == 0) { break; } case 7: d7 = *row_data; break; default: NNP_UNREACHABLE; } winograd_f6k3_input_transform(d0, d1, d2, d3, d4, d5, d6, d7, &block[row][0], &block[row][1], &block[row][2], &block[row][3], &block[row][4], &block[row][5], &block[row][6], &block[row][7]); data += data_stride; } for (uint32_t column = 0; column < BLOCK_SIZE; column++) { float wd0, wd1, wd2, wd3, wd4, wd5, wd6, wd7; winograd_f6k3_input_transform( block[0][column], block[1][column], block[2][column], block[3][column], block[4][column], block[5][column], block[6][column], block[7][column], &wd0, &wd1, &wd2, &wd3, &wd4, &wd5, &wd6, &wd7); *transform = wd0; transform += transform_stride; *transform = wd1; transform += transform_stride; *transform = wd2; transform += transform_stride; *transform = wd3; transform += transform_stride; *transform = wd4; transform += transform_stride; *transform = wd5; transform += transform_stride; *transform = wd6; transform += transform_stride; *transform = wd7; transform += transform_stride; } } void nnp_kwt8x8_3x3__scalar( const float g[restrict static 9], float transform[restrict static 1], size_t stride_g, size_t transform_stride, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset) { transform_stride /= sizeof(float); float block[KERNEL_SIZE][BLOCK_SIZE]; for (uint32_t row = 0; row < KERNEL_SIZE; row++) { float w0, w1, w2, w3, w4, w5, w6, w7; winograd_f6k3_kernel_transform( g[0], g[1], g[2], &block[row][0], &block[row][1], &block[row][2], &block[row][3], &block[row][4], &block[row][5], &block[row][6], &block[row][7], true); g += KERNEL_SIZE; } for (uint32_t column = 0; column < BLOCK_SIZE; column++) { float w0, w1, w2, w3, w4, w5, w6, w7; winograd_f6k3_kernel_transform( block[0][column], block[1][column], block[2][column], &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, true); *transform = w0; transform += transform_stride; *transform = w1; transform += transform_stride; *transform = w2; transform += transform_stride; *transform = w3; transform += transform_stride; *transform = w4; transform += transform_stride; *transform = w5; transform += transform_stride; *transform = w6; transform += transform_stride; *transform = w7; transform += transform_stride; } } #if !NNP_INFERENCE_ONLY void nnp_kwt8x8_3Rx3R__scalar( const float g[restrict static 9], float transform[restrict static 1], size_t stride_g, size_t transform_stride, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset) { transform_stride /= sizeof(float); g += KERNEL_SIZE * (KERNEL_SIZE - 1); float block[KERNEL_SIZE][BLOCK_SIZE]; for (uint32_t row = 0; row < KERNEL_SIZE; row++) { float w0, w1, w2, w3, w4, w5, w6, w7; winograd_f6k3_kernel_transform( g[2], g[1], g[0], &block[row][0], &block[row][1], &block[row][2], &block[row][3], &block[row][4], &block[row][5], &block[row][6], &block[row][7], true); g -= KERNEL_SIZE; } for (uint32_t column = 0; column < BLOCK_SIZE; column++) { float w0, w1, w2, w3, w4, w5, w6, w7; winograd_f6k3_kernel_transform( block[0][column], block[1][column], block[2][column], &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, true); *transform = w0; transform += transform_stride; *transform = w1; transform += transform_stride; *transform = w2; transform += transform_stride; *transform = w3; transform += transform_stride; *transform = w4; transform += transform_stride; *transform = w5; transform += transform_stride; *transform = w6; transform += transform_stride; *transform = w7; transform += transform_stride; } } void nnp_owt8x8_3x3__scalar( const float transform[restrict static 1], float output[restrict static 1], size_t transform_stride, size_t output_stride, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset) { transform_stride /= sizeof(float); float block[OUTPUT_SIZE][BLOCK_SIZE]; for (uint32_t column = 0; column < BLOCK_SIZE; column++) { const float m0 = *transform; transform += transform_stride; const float m1 = *transform; transform += transform_stride; const float m2 = *transform; transform += transform_stride; const float m3 = *transform; transform += transform_stride; const float m4 = *transform; transform += transform_stride; const float m5 = *transform; transform += transform_stride; const float m6 = *transform; transform += transform_stride; const float m7 = *transform; transform += transform_stride; winograd_f6k3_output_transform( m0, m1, m2, m3, m4, m5, m6, m7, &block[0][column], &block[1][column], &block[2][column], &block[3][column], &block[4][column], &block[5][column]); } const uint32_t row_end = row_offset + row_count; for (uint32_t row = row_offset; row < row_end; row++) { float s0, s1, s2, s3, s4, s5; winograd_f6k3_output_transform( block[row][0], block[row][1], block[row][2], block[row][3], block[row][4], block[row][5], block[row][6], block[row][7], &s0, &s1, &s2, &s3, &s4, &s5); float *restrict row_output = output + (row - row_offset) * output_stride; uint32_t remaining_column_count = column_count; switch (column_offset) { case 0: *row_output++ = s0; if (--remaining_column_count == 0) { break; } case 1: *row_output++ = s1; if (--remaining_column_count == 0) { break; } case 2: *row_output++ = s2; if (--remaining_column_count == 0) { break; } case 3: *row_output++ = s3; if (--remaining_column_count == 0) { break; } case 4: *row_output++ = s4; if (--remaining_column_count == 0) { break; } case 5: *row_output = s5; break; default: NNP_UNREACHABLE; } } } #endif /* !NNP_INFERENCE_ONLY */ void nnp_owt8x8_3x3_with_bias__scalar( const float transform[restrict static 1], float output[restrict static 1], const float bias[restrict static 1], size_t transform_stride, size_t output_stride, uint32_t row_count, uint32_t column_count) { transform_stride /= sizeof(float); const uint32_t row_offset = 0; const uint32_t column_offset = 0; float block[OUTPUT_SIZE][BLOCK_SIZE]; for (uint32_t column = 0; column < BLOCK_SIZE; column++) { const float m0 = *transform; transform += transform_stride; float m1 = *transform; transform += transform_stride; const float m2 = *transform; transform += transform_stride; const float m3 = *transform; transform += transform_stride; const float m4 = *transform; transform += transform_stride; const float m5 = *transform; transform += transform_stride; const float m6 = *transform; transform += transform_stride; const float m7 = *transform; transform += transform_stride; if (column == 1) { const float bias_value = *bias; m1 += bias_value; } winograd_f6k3_output_transform( m0, m1, m2, m3, m4, m5, m6, m7, &block[0][column], &block[1][column], &block[2][column], &block[3][column], &block[4][column], &block[5][column]); } const uint32_t row_end = row_offset + row_count; for (uint32_t row = row_offset; row < row_end; row++) { float s0, s1, s2, s3, s4, s5; winograd_f6k3_output_transform( block[row][0], block[row][1], block[row][2], block[row][3], block[row][4], block[row][5], block[row][6], block[row][7], &s0, &s1, &s2, &s3, &s4, &s5); float *restrict row_output = output + (row - row_offset) * output_stride; uint32_t remaining_column_count = column_count; switch (column_offset) { case 0: *row_output++ = s0; if (--remaining_column_count == 0) { break; } case 1: *row_output++ = s1; if (--remaining_column_count == 0) { break; } case 2: *row_output++ = s2; if (--remaining_column_count == 0) { break; } case 3: *row_output++ = s3; if (--remaining_column_count == 0) { break; } case 4: *row_output++ = s4; if (--remaining_column_count == 0) { break; } case 5: *row_output = s5; break; default: NNP_UNREACHABLE; } } } void nnp_owt8x8_3x3_with_bias_with_relu__scalar( const float transform[restrict static 1], float output[restrict static 1], const float bias[restrict static 1], size_t transform_stride, size_t output_stride, uint32_t row_count, uint32_t column_count) { transform_stride /= sizeof(float); const uint32_t row_offset = 0; const uint32_t column_offset = 0; float block[OUTPUT_SIZE][BLOCK_SIZE]; for (uint32_t column = 0; column < BLOCK_SIZE; column++) { const float m0 = *transform; transform += transform_stride; float m1 = *transform; transform += transform_stride; const float m2 = *transform; transform += transform_stride; const float m3 = *transform; transform += transform_stride; const float m4 = *transform; transform += transform_stride; const float m5 = *transform; transform += transform_stride; const float m6 = *transform; transform += transform_stride; const float m7 = *transform; transform += transform_stride; if (column == 1) { const float bias_value = *bias; m1 += bias_value; } winograd_f6k3_output_transform( m0, m1, m2, m3, m4, m5, m6, m7, &block[0][column], &block[1][column], &block[2][column], &block[3][column], &block[4][column], &block[5][column]); } const uint32_t row_end = row_offset + row_count; for (uint32_t row = row_offset; row < row_end; row++) { float s0, s1, s2, s3, s4, s5; winograd_f6k3_output_transform( block[row][0], block[row][1], block[row][2], block[row][3], block[row][4], block[row][5], block[row][6], block[row][7], &s0, &s1, &s2, &s3, &s4, &s5); float *restrict row_output = output + (row - row_offset) * output_stride; uint32_t remaining_column_count = column_count; switch (column_offset) { case 0: *row_output++ = relu(s0, 0.0f); if (--remaining_column_count == 0) { break; } case 1: *row_output++ = relu(s1, 0.0f); if (--remaining_column_count == 0) { break; } case 2: *row_output++ = relu(s2, 0.0f); if (--remaining_column_count == 0) { break; } case 3: *row_output++ = relu(s3, 0.0f); if (--remaining_column_count == 0) { break; } case 4: *row_output++ = relu(s4, 0.0f); if (--remaining_column_count == 0) { break; } case 5: *row_output = relu(s5, 0.0f); break; default: NNP_UNREACHABLE; } } }