#include #include #include #include #include #include #include #include #include #include #include #include struct NNP_CACHE_ALIGN kernel_transform_context { nnp_transform_2d_with_offset transform_function; const float* kernel; void* kernel_transform; size_t tuple_size; size_t input_channels; size_t input_channels_block_size; size_t output_channels; struct nnp_size kernel_size; }; static void compute_kernel_transform( const struct kernel_transform_context context[restrict static 1], size_t output_channels_subblock_start, size_t input_channels_block_offset, size_t output_channels_subblock_size, size_t input_channels_block_increment) { const size_t tuple_size = context->tuple_size; const size_t input_channels = context->input_channels; const size_t input_channels_block_size = context->input_channels_block_size; const size_t output_channels = context->output_channels; const struct nnp_size kernel_size = context->kernel_size; const float (*kernel)[input_channels][kernel_size.width * kernel_size.height] = (const float(*)[input_channels][kernel_size.width * kernel_size.height]) context->kernel; void* kernel_transform = context->kernel_transform; nnp_transform_2d_with_offset transform_function = context->transform_function; for (size_t output_channels_subblock_offset = 0; output_channels_subblock_offset < output_channels_subblock_size; output_channels_subblock_offset += 1) { const size_t output_channel = output_channels_subblock_start + output_channels_subblock_offset; transform_function( kernel[output_channel][input_channels_block_offset], kernel_transform + (output_channels_subblock_start * input_channels_block_size + input_channels_block_offset * output_channels_subblock_size + output_channels_subblock_offset) * tuple_size, kernel_size.width, input_channels_block_size * output_channels * tuple_size, kernel_size.height, kernel_size.width, 0, 0); } } struct NNP_CACHE_ALIGN input_transform_context { const float* input; void* input_transform; nnp_transform_2d_with_offset 
transform_function; const size_t tuple_size; const size_t tiles_count; const struct fxdiv_divisor_size_t tiles_x_count; const size_t input_channels_block_start; const size_t input_channels_block_size; const struct nnp_size input_size; const size_t input_padding_left; const size_t input_padding_top; const struct nnp_size input_tile; const struct nnp_size input_tile_step; }; static void compute_input_transform( const struct input_transform_context context[restrict static 1], size_t input_channels_block_offset, size_t tiles_subblock_start, size_t input_channels_block_range, size_t tiles_subblock_size) { const size_t tuple_size = context->tuple_size; const size_t tiles_count = context->tiles_count; const struct fxdiv_divisor_size_t tiles_x_count = context->tiles_x_count; const size_t input_channels_block_start = context->input_channels_block_start; const size_t input_channels_block_size = context->input_channels_block_size; const struct nnp_size input_size = context->input_size; const size_t input_padding_left = context->input_padding_left; const size_t input_padding_top = context->input_padding_top; const struct nnp_size input_tile = context->input_tile; const struct nnp_size input_tile_step = context->input_tile_step; const float (*input)[input_size.height][input_size.width] = (const float(*)[input_size.height][input_size.width]) context->input; void* input_transform = context->input_transform; nnp_transform_2d_with_offset transform_function = context->transform_function; const size_t input_channel = input_channels_block_start + input_channels_block_offset; for (size_t tiles_subblock_offset = 0; tiles_subblock_offset < tiles_subblock_size; tiles_subblock_offset += 1) { const size_t tile = tiles_subblock_start + tiles_subblock_offset; const struct fxdiv_result_size_t tile_xy = fxdiv_divide_size_t(tile, tiles_x_count); const size_t tile_x = tile_xy.remainder; const size_t tile_y = tile_xy.quotient; const size_t output_x = tile_x * input_tile_step.width; const size_t 
output_y = tile_y * input_tile_step.height; const size_t input_x = min(doz(output_x, input_padding_left), input_size.width); const size_t input_y = min(doz(output_y, input_padding_top), input_size.height); const size_t row_offset = doz(input_padding_top, output_y); const size_t row_count = min(input_size.height - input_y, input_tile.height - row_offset); const size_t column_offset = doz(input_padding_left, output_x); const size_t column_count = min(input_size.width - input_x, input_tile.width - column_offset); transform_function( &input[input_channel][input_y][input_x], input_transform + (tiles_subblock_start * input_channels_block_size + input_channels_block_offset * tiles_subblock_size + tiles_subblock_offset) * tuple_size, input_size.width, input_channels_block_size * tiles_count * tuple_size, row_count, column_count, row_offset, column_offset); } } struct NNP_CACHE_ALIGN output_transform_context { nnp_transform_2d_with_bias transform_function; float* output; const void* output_transform; const float* bias; size_t tuple_size; size_t tiles_count; struct fxdiv_divisor_size_t tiles_x_count; struct fxdiv_divisor_size_t tiles_block_max; size_t output_channels; struct nnp_size output_size; struct nnp_size output_tile; }; static void compute_output_transform( const struct output_transform_context context[restrict static 1], size_t output_channels_subblock_start, size_t tiles_subblock_start, size_t output_channels_subblock_size, size_t tiles_subblock_size) { const size_t tuple_size = context->tuple_size; const size_t tiles_count = context->tiles_count; const struct fxdiv_divisor_size_t tiles_x_count = context->tiles_x_count; const struct fxdiv_divisor_size_t tiles_block_max = context->tiles_block_max; const size_t output_channels = context->output_channels; const struct nnp_size output_size = context->output_size; const struct nnp_size output_tile = context->output_tile; const size_t tiles_block_start = fxdiv_round_down_size_t(tiles_subblock_start, tiles_block_max); 
const size_t tiles_block_size = min(tiles_count - tiles_block_start, tiles_block_max.value); float (*output)[output_size.height][output_size.width] = (float(*)[output_size.height][output_size.width]) context->output; const void* output_transform = context->output_transform; const float* bias = context->bias; nnp_transform_2d_with_bias transform_function = context->transform_function; for (size_t tiles_subblock_offset = 0; tiles_subblock_offset < tiles_subblock_size; tiles_subblock_offset += 1) { const size_t tile = tiles_subblock_start + tiles_subblock_offset; const struct fxdiv_result_size_t tile_xy = fxdiv_divide_size_t(tile, tiles_x_count); const size_t tile_x = tile_xy.remainder; const size_t tile_y = tile_xy.quotient; const size_t output_x = tile_x * output_tile.width; const size_t output_y = tile_y * output_tile.height; for (size_t output_channels_subblock_offset = 0; output_channels_subblock_offset < output_channels_subblock_size; output_channels_subblock_offset += 1) { const size_t output_channel = output_channels_subblock_start + output_channels_subblock_offset; transform_function( output_transform + (tiles_block_start * output_channels + output_channels_subblock_start * tiles_block_size + ((tiles_subblock_start - tiles_block_start) + tiles_subblock_offset) * output_channels_subblock_size + output_channels_subblock_offset) * tuple_size, &output[output_channel][output_y][output_x], &bias[output_channel], tiles_count * output_channels * tuple_size, output_size.width, min(output_tile.height, output_size.height - output_y), min(output_tile.width, output_size.width - output_x)); } } } struct NNP_CACHE_ALIGN tuple_multiplication_context { size_t tuple_elements; size_t tuple_size; size_t tiles_subblock_max; size_t input_channels_block_size; size_t input_channels_block_start; size_t output_channels; size_t output_channels_subblock_max; size_t output_channels_block_start; const void* input_transform; const void* kernel_transform; void* output_transform; 
nnp_fast_tuple_gemm_function fast_gemm; nnp_full_tuple_gemm_function full_gemm; }; static void compute_tuple_multiplication( const struct tuple_multiplication_context context[restrict static 1], size_t tiles_block_start, size_t output_channels_subblock_start, size_t tiles_block_size, size_t output_channels_subblock_size) { const size_t tuple_elements = context->tuple_elements; const size_t tuple_size = context->tuple_size; const size_t tiles_subblock_max = context->tiles_subblock_max; const size_t input_channels_block_size = context->input_channels_block_size; const size_t input_channels_block_start = context->input_channels_block_start; const size_t output_channels = context->output_channels; const size_t output_channels_subblock_max = context->output_channels_subblock_max; const size_t output_channels_block_start = context->output_channels_block_start; const void* input_transform = context->input_transform + tiles_block_start * input_channels_block_size * tuple_size; const void* kernel_transform = context->kernel_transform + (output_channels_block_start + output_channels_subblock_start) * input_channels_block_size * tuple_size; void* output_transform = context->output_transform + (tiles_block_start * output_channels + (output_channels_block_start + output_channels_subblock_start) * tiles_block_size) * tuple_size; if (output_channels_subblock_size == output_channels_subblock_max) { const nnp_fast_tuple_gemm_function fast_gemm = context->fast_gemm; while (tiles_block_size >= tiles_subblock_max) { tiles_block_size -= tiles_subblock_max; fast_gemm( input_channels_block_size, input_channels_block_start, input_transform, kernel_transform, output_transform, output_channels_subblock_size * tuple_elements); input_transform += tiles_subblock_max * input_channels_block_size * tuple_size; output_transform += tiles_subblock_max * output_channels_subblock_size * tuple_size; } } const nnp_full_tuple_gemm_function full_gemm = context->full_gemm; while (tiles_block_size != 0) { 
const size_t tiles_subblock_size = min(tiles_block_size, tiles_subblock_max); tiles_block_size -= tiles_subblock_size; full_gemm( tiles_subblock_size, output_channels_subblock_size, input_channels_block_size, input_channels_block_start, input_transform, kernel_transform, output_transform, output_channels_subblock_size * tuple_elements); input_transform += tiles_subblock_max * input_channels_block_size * tuple_size; output_transform += tiles_subblock_max * output_channels_subblock_size * tuple_size; } } struct NNP_CACHE_ALIGN kernel_packing_context { const float* kernel; float* packed_kernel; size_t reduction_size; size_t reduction_block_start; size_t reduction_block_size; }; static void compute_kernel_packing( const struct kernel_packing_context context[restrict static 1], size_t output_channels_subblock_start, size_t reduction_block_offset, size_t output_channels_subblock_size, size_t reduction_block_range) { const size_t reduction_size = context->reduction_size; const size_t reduction_block_start = context->reduction_block_start; const size_t reduction_block_size = context->reduction_block_size; const float* kernel = context->kernel + output_channels_subblock_start * reduction_size + reduction_block_offset; float* packed_kernel = context->packed_kernel + output_channels_subblock_start * reduction_block_size + reduction_block_offset * output_channels_subblock_size; for (size_t output_channels_subblock_offset = 0; output_channels_subblock_offset < output_channels_subblock_size; output_channels_subblock_offset += 1) { packed_kernel[output_channels_subblock_offset] = kernel[output_channels_subblock_offset * reduction_size]; } } struct NNP_CACHE_ALIGN input_packing_context { const float* input; float* packed_input; size_t simd_width; size_t reduction_block_start; size_t reduction_block_size; size_t output_image_block_start; struct nnp_size input_size; size_t input_padding_top; size_t input_padding_left; struct fxdiv_divisor_size_t kernel_elements; struct 
fxdiv_divisor_size_t kernel_width; struct fxdiv_divisor_size_t output_width; struct nnp_size output_subsampling; }; static void compute_input_packing( const struct input_packing_context context[restrict static 1], size_t reduction_block_offset, size_t output_image_subblock_start, size_t reduction_block_range, size_t output_image_subblock_size) { const size_t simd_width = context->simd_width; const size_t reduction_block_start = context->reduction_block_start; const size_t reduction_block_size = context->reduction_block_size; const size_t output_image_block_start = context->output_image_block_start; const struct nnp_size input_size = context->input_size; const size_t input_padding_top = context->input_padding_top; const size_t input_padding_left = context->input_padding_left; const struct fxdiv_divisor_size_t kernel_elements = context->kernel_elements; const struct fxdiv_divisor_size_t kernel_width = context->kernel_width; const struct fxdiv_divisor_size_t output_width = context->output_width; const struct nnp_size output_subsampling = context->output_subsampling; const float (*input)[input_size.height][input_size.width] = (const float(*)[input_size.height][input_size.width]) context->input; float* packed_input = context->packed_input; const size_t output_image_subblock_stride = round_up_by_power_of_2(output_image_subblock_size, simd_width); const size_t reduction_index = reduction_block_start + reduction_block_offset; const struct fxdiv_result_size_t reduction_index_divmod = fxdiv_divide_size_t(reduction_index, kernel_elements); const size_t input_channel = reduction_index_divmod.quotient; const struct fxdiv_result_size_t kernel_xy = fxdiv_divide_size_t(reduction_index_divmod.remainder, kernel_width); const size_t kernel_y = kernel_xy.quotient; const size_t kernel_x = kernel_xy.remainder; for (size_t output_image_subblock_offset = 0; output_image_subblock_offset < output_image_subblock_size; output_image_subblock_offset += 1) { const size_t output_image_index = 
output_image_block_start + output_image_subblock_start + output_image_subblock_offset; const struct fxdiv_result_size_t output_xy = fxdiv_divide_size_t(output_image_index, output_width); const size_t output_y = output_xy.quotient; const size_t output_x = output_xy.remainder; const size_t input_y = output_y * output_subsampling.height + kernel_y - input_padding_top; const size_t input_x = output_x * output_subsampling.width + kernel_x - input_padding_left; const size_t packed_index = output_image_subblock_start * reduction_block_size + reduction_block_offset * output_image_subblock_stride + output_image_subblock_offset; if ((input_x < input_size.width) && (input_y < input_size.height)) { packed_input[packed_index] = input[input_channel][input_y][input_x]; } else { packed_input[packed_index] = 0.0f; } } } struct NNP_CACHE_ALIGN matrix_multiplication_context { const float* packed_kernel; const float* packed_input; float* output; size_t reduction_block_start; size_t reduction_block_size; size_t output_image_size; size_t output_image_block_start; size_t output_image_subblock_max; size_t output_channels_subblock_max; }; static void compute_matrix_multiplication( const struct matrix_multiplication_context context[restrict static 1], size_t output_channels_block_start, size_t output_image_subblock_start, size_t output_channels_block_size, size_t output_image_subblock_size) { const size_t reduction_block_start = context->reduction_block_start; const size_t reduction_block_size = context->reduction_block_size; const size_t output_image_size = context->output_image_size; const size_t output_image_block_start = context->output_image_block_start; const size_t output_image_subblock_max = context->output_image_subblock_max; const size_t output_channels_subblock_max = context->output_channels_subblock_max; const float* packed_kernel = context->packed_kernel + output_channels_block_start * reduction_block_size; const float* packed_input = context->packed_input + 
output_image_subblock_start * reduction_block_size; float* output = context->output + output_channels_block_start * output_image_size + output_image_block_start + output_image_subblock_start; if (output_image_subblock_size == output_image_subblock_max) { const nnp_fast_sgemm_function fast_gemm = nnp_hwinfo.sgemm.only_mr_x_nr; while (output_channels_block_size >= output_channels_subblock_max) { output_channels_block_size -= output_channels_subblock_max; fast_gemm( reduction_block_size, reduction_block_start, packed_kernel, packed_input, output, output_image_size); packed_kernel += reduction_block_size * output_channels_subblock_max; output += output_image_size * output_channels_subblock_max; } } const nnp_full_sgemm_function full_gemm = nnp_hwinfo.sgemm.upto_mr_x_nr; while (output_channels_block_size != 0) { const size_t output_channels_subblock_size = min(output_channels_block_size, output_channels_subblock_max); output_channels_block_size -= output_channels_subblock_size; full_gemm( output_channels_subblock_size, output_image_subblock_size, reduction_block_size, reduction_block_start, packed_kernel, packed_input, output, output_image_size); packed_kernel += reduction_block_size * output_channels_subblock_max; output += output_image_size * output_channels_subblock_max; } } struct NNP_CACHE_ALIGN direct_convolution_context { const float* input; const float* kernel; float* output; size_t image_elements; size_t input_channels; size_t input_channels_block_max; size_t output_channels_block_max; nnp_fast_conv_function fast_conv; nnp_full_conv_function full_conv; }; static void compute_direct_convolution( const struct direct_convolution_context context[restrict static 1], size_t output_channels_block_start, size_t output_channels_block_size) { const size_t image_elements = context->image_elements; const size_t input_channels = context->input_channels; const size_t input_channels_block_max = context->input_channels_block_max; const size_t output_channels_block_max = 
context->output_channels_block_max; const float* input = context->input; const float* kernel = context->kernel + output_channels_block_start * input_channels; float* output = context->output + output_channels_block_start * image_elements; memset(output, 0, sizeof(float) * output_channels_block_size * image_elements); size_t input_channels_unprocessed = input_channels; if (output_channels_block_size == output_channels_block_max) { const nnp_fast_conv_function fast_conv = context->fast_conv; while (input_channels_unprocessed >= input_channels_block_max) { input_channels_unprocessed -= input_channels_block_max; fast_conv( input_channels, image_elements, input, kernel, output); input += input_channels_block_max * image_elements; kernel += input_channels_block_max; } } const nnp_full_conv_function full_conv = context->full_conv; while (input_channels_unprocessed != 0) { const size_t input_channels_block_size = min(input_channels_unprocessed, input_channels_block_max); input_channels_unprocessed -= input_channels_block_size; full_conv( input_channels_block_size, output_channels_block_size, input_channels, image_elements, input, kernel, output); input += input_channels_block_max * image_elements; kernel += input_channels_block_max; } } static enum nnp_status compute_fast_convolution_inference( const bool fourier_transform, const enum nnp_convolution_transform_strategy transform_strategy, const size_t transform_element_size, const size_t input_channels, const size_t output_channels, const struct nnp_size tile_size, const struct nnp_size input_size, const struct nnp_padding input_padding, const struct nnp_size kernel_size, const struct nnp_size output_size, const struct nnp_size output_subsampling, const float* input, const float* kernel, const float* bias, float* output, void* workspace_buffer, size_t* workspace_size, const nnp_transform_2d_with_offset input_transform_function, const nnp_transform_2d_with_offset kernel_transform_function, const nnp_transform_2d_with_bias 
output_transform_function,
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	/*
	 * Transform-based convolution inference.
	 *
	 * - compute / reuse strategies: transform the input tiles (and, for compute,
	 *   the kernel) one input-channels block at a time, multiply the transformed
	 *   tuples, then apply the output transform with bias into `output`.
	 * - precompute strategy: only write kernel transforms into workspace_buffer.
	 *
	 * Workspace protocol: with workspace_buffer == NULL and a non-NULL
	 * workspace_size (compute / reuse) the required size is stored in
	 * *workspace_size and the function returns without computing anything.
	 *
	 * Returns nnp_status_success, nnp_status_out_of_memory (internal allocation
	 * failed), nnp_status_insufficient_buffer (*workspace_size too small) or
	 * nnp_status_invalid_transform_strategy (any other strategy value).
	 */
	void* memory_block = NULL;
	size_t memory_size = 0;
	const size_t simd_width = nnp_hwinfo.simd_width;
	/* A tuple holds twice the SIMD width when fourier_transform is set, otherwise the SIMD width. */
	const size_t tuple_elements = (fourier_transform ? simd_width * 2 : simd_width);
	const size_t tuple_size = tuple_elements * transform_element_size;
	const size_t tile_elements = tile_size.height * tile_size.width;
	const size_t tuple_count = tile_elements / tuple_elements;
	const struct nnp_size output_tile_size = {
		.width = (tile_size.width - kernel_size.width) / output_subsampling.width + 1,
		.height = (tile_size.height - kernel_size.height) / output_subsampling.height + 1
	};
	const struct nnp_size tile_step = {
		.width = tile_size.width - kernel_size.width + 1,
		.height = tile_size.height - kernel_size.height + 1
	};
	const size_t tiles_y_count = divide_round_up(output_size.height, output_tile_size.height);
	const size_t tiles_x_count = divide_round_up(output_size.width, output_tile_size.width);
	const size_t tiles_count = tiles_x_count * tiles_y_count;

	/* Calculate cache blocking parameters */
	const size_t cache_elements_l1 = nnp_hwinfo.blocking.l1 / tuple_size;
	const size_t cache_elements_l2 = nnp_hwinfo.blocking.l2 / tuple_size;
	const size_t cache_elements_l3 = nnp_hwinfo.blocking.l3 / tuple_size;
	const size_t tiles_subblock_max = (fourier_transform ? nnp_hwinfo.cxgemm.mr : nnp_hwinfo.sxgemm.mr);
	const size_t output_channels_subblock_max = (fourier_transform ? nnp_hwinfo.cxgemm.nr : nnp_hwinfo.sxgemm.nr);
	/* NOTE(review): the block sizes below are used as divisors and loop steps; presumably never zero for real hwinfo values - confirm. */
	const size_t input_channels_block_max = round_down(cache_elements_l1 / (tiles_subblock_max + output_channels_subblock_max), 2);
	const size_t tiles_block_max = round_down(cache_elements_l2 / input_channels_block_max, tiles_subblock_max);
	const size_t output_channels_block_max = round_down(cache_elements_l3 / input_channels_block_max, output_channels_subblock_max);

	const size_t transform_tile_size = tile_elements * transform_element_size;
	/* Input transforms are held for one input-channels block only; output transforms for all output channels. */
	const size_t input_transform_size = tiles_count * min(input_channels, input_channels_block_max) * transform_tile_size;
	const size_t output_transform_size = tiles_count * output_channels * transform_tile_size;
	switch (transform_strategy) {
		case nnp_convolution_transform_strategy_compute:
		case nnp_convolution_transform_strategy_reuse:
		{
			memory_size = input_transform_size + output_transform_size;
			const size_t kernel_transform_size = output_channels * min(input_channels, input_channels_block_max) * transform_tile_size;
			/* Space for kernel transforms is only needed when they are computed here. */
			if (transform_strategy == nnp_convolution_transform_strategy_compute) {
				memory_size += kernel_transform_size;
			}
			if (workspace_buffer == NULL) {
				if (workspace_size == NULL) {
					/* No caller workspace at all: allocate internally (released at the end of the function). */
					memory_block = allocate_memory(memory_size);
					if (memory_block == NULL) {
						return nnp_status_out_of_memory;
					}
				} else {
					/* Size query: report the required workspace size and do no work. */
					*workspace_size = memory_size;
					return nnp_status_success;
				}
			} else {
				/* NOTE(review): workspace_size is dereferenced here without a NULL check - presumably validated by the caller; confirm. */
				if (*workspace_size < memory_size) {
					return nnp_status_insufficient_buffer;
				}
				memory_block = workspace_buffer;
			}
			/* Memory layout: input transforms, then output transforms, then (compute only) kernel transforms. */
			void* input_transform = memory_block;
			void* output_transform = memory_block + input_transform_size;
			void* kernel_transform = memory_block + input_transform_size + output_transform_size;
			for (size_t input_channels_block_start = 0; input_channels_block_start < input_channels; input_channels_block_start += input_channels_block_max) {
				const size_t input_channels_block_size = min(input_channels - input_channels_block_start, input_channels_block_max);
				if (transform_strategy == nnp_convolution_transform_strategy_compute) {
					NNP_KERNEL_TRANSFORM_START(profile)
					struct kernel_transform_context kernel_transform_context = {
						.transform_function = kernel_transform_function,
						.kernel = kernel + input_channels_block_start * kernel_size.height * kernel_size.width,
						.kernel_transform = kernel_transform,
						.tuple_size = tuple_size,
						.input_channels = input_channels,
						.input_channels_block_size = input_channels_block_size,
						.output_channels = output_channels,
						.kernel_size = kernel_size,
					};
					pthreadpool_parallelize_2d_tile_2d(threadpool,
						(pthreadpool_task_2d_tile_2d_t) compute_kernel_transform,
						&kernel_transform_context,
						output_channels, input_channels_block_size,
						output_channels_subblock_max, 1,
						PTHREADPOOL_FLAG_DISABLE_DENORMALS);
					NNP_KERNEL_TRANSFORM_END(profile)
				} else {
					/* Reuse strategy: `kernel` itself is used as the kernel-transform data, offset (in bytes) to the current input-channels block. */
					kernel_transform = (void*) kernel + input_channels_block_start * output_channels * transform_tile_size;
				}

				NNP_INPUT_TRANSFORM_START(profile)
				struct input_transform_context input_transform_context = {
					.input = input,
					.input_transform = input_transform,
					.transform_function = input_transform_function,
					.tuple_size = tuple_size,
					.tiles_count = tiles_count,
					.tiles_x_count = fxdiv_init_size_t(tiles_x_count),
					.input_channels_block_start = input_channels_block_start,
					.input_channels_block_size = input_channels_block_size,
					.input_size = input_size,
					.input_padding_left = input_padding.left,
					.input_padding_top = input_padding.top,
					.input_tile = tile_size,
					.input_tile_step = tile_step,
				};
				pthreadpool_parallelize_2d_tile_2d(threadpool,
					(pthreadpool_task_2d_tile_2d_t) compute_input_transform,
					&input_transform_context,
					input_channels_block_size, tiles_count,
					1, tiles_subblock_max,
					PTHREADPOOL_FLAG_DISABLE_DENORMALS);
				NNP_INPUT_TRANSFORM_END(profile)

				NNP_BLOCK_MULTIPLICATION_START(profile)
				for (size_t tuple_index = 0; tuple_index < tuple_count; tuple_index += 1) {
					/* Pick the gemm function pair for this tuple. */
					nnp_full_tuple_gemm_function full_gemm_function;
					nnp_fast_tuple_gemm_function fast_gemm_function;
					if (fourier_transform) {
						/* Tuples below NNP_COMPLEX_TUPLE_INDEX use the s4cX variants, the rest the cX variants. */
						if (tuple_index < NNP_COMPLEX_TUPLE_INDEX) {
							fast_gemm_function = nnp_hwinfo.cxgemm.s4cX_conjb_only_mr_x_nr;
							full_gemm_function = nnp_hwinfo.cxgemm.s4cX_conjb_upto_mr_x_nr;
						} else {
							fast_gemm_function = nnp_hwinfo.cxgemm.cX_conjb_only_mr_x_nr;
							full_gemm_function = nnp_hwinfo.cxgemm.cX_conjb_upto_mr_x_nr;
						}
					} else {
						if NNP_LIKELY(transform_element_size == sizeof(float)) {
							fast_gemm_function = nnp_hwinfo.sxgemm.only_mr_x_nr;
							full_gemm_function = nnp_hwinfo.sxgemm.upto_mr_x_nr;
						} else {
							/*
							 * NOTE(review): when NNP_BACKEND_ARM is not set, both function
							 * pointers stay uninitialized on this path - presumably callers
							 * never pass a non-float element size on other backends; confirm.
							 */
							#if NNP_BACKEND_ARM
								fast_gemm_function = nnp_hwinfo.hxgemm.only_mr_x_nr;
								full_gemm_function = nnp_hwinfo.hxgemm.upto_mr_x_nr;
							#endif /* NNP_BACKEND_ARM */
						}
					}
					for (size_t output_channels_block_start = 0; output_channels_block_start < output_channels; output_channels_block_start += output_channels_block_max) {
						const size_t output_channels_block_size = min(output_channels - output_channels_block_start, output_channels_block_max);
						/* Each tuple index has its own slice in all three transform buffers. */
						struct tuple_multiplication_context tuple_multiplication_context = {
							.tuple_elements = tuple_elements,
							.tuple_size = tuple_size,
							.tiles_subblock_max = tiles_subblock_max,
							.input_channels_block_start = input_channels_block_start,
							.input_channels_block_size = input_channels_block_size,
							.output_channels = output_channels,
							.output_channels_subblock_max = output_channels_subblock_max,
							.output_channels_block_start = output_channels_block_start,
							.input_transform = input_transform + tuple_index * tiles_count * input_channels_block_size * tuple_size,
							.kernel_transform = kernel_transform + tuple_index * output_channels * input_channels_block_size * tuple_size,
							.output_transform = output_transform + tuple_index * tiles_count * output_channels * tuple_size,
							.fast_gemm = fast_gemm_function,
							.full_gemm = full_gemm_function,
						};
						pthreadpool_parallelize_2d_tile_2d(threadpool,
							(pthreadpool_task_2d_tile_2d_t) compute_tuple_multiplication,
							&tuple_multiplication_context,
							tiles_count, output_channels_block_size,
							tiles_block_max, output_channels_subblock_max,
							PTHREADPOOL_FLAG_DISABLE_DENORMALS);
					}
				}
				NNP_BLOCK_MULTIPLICATION_END(profile)
			}
			/* The output transform runs once, after every input-channels block has been multiplied. */
			NNP_OUTPUT_TRANSFORM_START(profile)
			struct output_transform_context output_transform_context = {
				.transform_function = output_transform_function,
				.output = output,
				.output_transform = output_transform,
				.bias = bias,
				.tuple_size = tuple_size,
				.tiles_count = tiles_count,
				.tiles_x_count = fxdiv_init_size_t(tiles_x_count),
				.tiles_block_max = fxdiv_init_size_t(tiles_block_max),
				.output_channels = output_channels,
				.output_size = output_size,
				.output_tile = output_tile_size,
			};
			pthreadpool_parallelize_2d_tile_2d(threadpool,
				(pthreadpool_task_2d_tile_2d_t) compute_output_transform,
				&output_transform_context,
				output_channels, tiles_count,
				output_channels_subblock_max, tiles_subblock_max,
				PTHREADPOOL_FLAG_DISABLE_DENORMALS);
			NNP_OUTPUT_TRANSFORM_END(profile)
			break;
		}
		case nnp_convolution_transform_strategy_precompute:
		{
			/* Precompute: the workspace receives the kernel transforms for all input channels. */
			const size_t kernel_transform_size = output_channels * input_channels * transform_tile_size;
			if (workspace_buffer == NULL) {
				/* NOTE(review): unlike the compute/reuse case, workspace_size is not checked for NULL before this store - confirm callers guarantee it. */
				*workspace_size = kernel_transform_size;
				return nnp_status_success;
			} else {
				if (*workspace_size < kernel_transform_size) {
					return nnp_status_insufficient_buffer;
				}
				memory_block = workspace_buffer;
			}
			for (size_t input_channels_block_start = 0; input_channels_block_start < input_channels; input_channels_block_start += input_channels_block_max) {
				const size_t input_channels_block_size = min(input_channels - input_channels_block_start, input_channels_block_max);
				NNP_KERNEL_TRANSFORM_START(profile)
				struct kernel_transform_context kernel_transform_context = {
					.transform_function = kernel_transform_function,
					.kernel = kernel + input_channels_block_start * kernel_size.height * kernel_size.width,
					.kernel_transform = (void*) workspace_buffer + input_channels_block_start * output_channels * transform_tile_size,
					.tuple_size = tuple_size,
					.input_channels = input_channels,
					.input_channels_block_size = input_channels_block_size,
					.output_channels = output_channels,
					.kernel_size = kernel_size,
				};
				pthreadpool_parallelize_2d_tile_2d(threadpool,
					(pthreadpool_task_2d_tile_2d_t) compute_kernel_transform,
					&kernel_transform_context,
					output_channels, input_channels_block_size,
					output_channels_subblock_max, 1,
					PTHREADPOOL_FLAG_DISABLE_DENORMALS);
				NNP_KERNEL_TRANSFORM_END(profile)
			}
			break;
		}
		default:
			return nnp_status_invalid_transform_strategy;
	}
	/* Only memory allocated inside this function differs from workspace_buffer and is released here. */
	if (memory_block != workspace_buffer) {
		release_memory(memory_block, memory_size);
	}
	return nnp_status_success;
}

static enum nnp_status compute_gemm_convolution_inference(
	const enum nnp_convolution_transform_strategy transform_strategy,
	const size_t input_channels,
	const size_t output_channels,
	const struct nnp_size input_size,
	const struct nnp_padding input_padding,
	const struct nnp_size kernel_size,
	const struct nnp_size output_size,
	const struct nnp_size output_subsampling,
	const float* input,
	const float* kernel,
	const float* bias,
	float* output,
	void* workspace_buffer,
	size_t* workspace_size,
	enum nnp_activation activation,
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	enum nnp_status status = nnp_status_success;
	void* memory_block = NULL;
	size_t memory_size = 0;
	const size_t simd_width = nnp_hwinfo.simd_width;

	/* Calculate cache blocking parameters */
	const size_t cache_elements_l1 = nnp_hwinfo.blocking.l1 / sizeof(float);
	const size_t cache_elements_l2 = nnp_hwinfo.blocking.l2 / sizeof(float);
	const size_t cache_elements_l3 = nnp_hwinfo.blocking.l3 / sizeof(float);
	const size_t output_channels_subblock_max = nnp_hwinfo.sgemm.mr;
	const size_t output_image_subblock_max = nnp_hwinfo.sgemm.nr;
	const size_t reduction_size = input_channels * kernel_size.height * kernel_size.width;
	const size_t output_image_size = output_size.height * output_size.width;
	const size_t reduction_block_max = round_down(cache_elements_l1 / (output_channels_subblock_max + output_image_subblock_max), 2);
	const size_t output_channels_block_max = round_down(cache_elements_l2 / reduction_block_max, output_channels_subblock_max);
	const
size_t output_image_block_max = round_down(cache_elements_l3 / reduction_block_max, output_image_subblock_max); switch (transform_strategy) { case nnp_convolution_transform_strategy_compute: case nnp_convolution_transform_strategy_reuse: { const size_t packed_kernel_size = output_channels * min(reduction_block_max, reduction_size) * sizeof(float); const size_t packed_input_size = min(output_image_block_max, round_up(output_image_size, simd_width)) * min(reduction_block_max, reduction_size) * sizeof(float); memory_size = packed_kernel_size + packed_input_size; if (workspace_buffer == NULL) { if (workspace_size == NULL) { memory_block = allocate_memory(memory_size); if (memory_block == NULL) { return nnp_status_out_of_memory; } } else { *workspace_size = memory_size; return nnp_status_success; } } else { if (*workspace_size < memory_size) { return nnp_status_insufficient_buffer; } memory_block = workspace_buffer; } float* packed_input = memory_block; float* packed_kernel = memory_block + packed_input_size; for (size_t reduction_block_start = 0; reduction_block_start < reduction_size; reduction_block_start += reduction_block_max) { const size_t reduction_block_size = min(reduction_size - reduction_block_start, reduction_block_max); if (transform_strategy == nnp_convolution_transform_strategy_compute) { /* Pack kernel into memory block */ NNP_KERNEL_TRANSFORM_START(profile) struct kernel_packing_context kernel_packing_context = { .kernel = kernel + reduction_block_start, .packed_kernel = packed_kernel, .reduction_size = reduction_size, .reduction_block_start = reduction_block_start, .reduction_block_size = reduction_block_size, }; pthreadpool_parallelize_2d_tile_2d(threadpool, (pthreadpool_task_2d_tile_2d_t) compute_kernel_packing, &kernel_packing_context, output_channels, reduction_block_size, output_channels_subblock_max, 1, PTHREADPOOL_FLAG_DISABLE_DENORMALS); NNP_KERNEL_TRANSFORM_END(profile) } else { packed_kernel = (void*) kernel + output_channels * 
reduction_block_start * sizeof(float); } const struct fxdiv_divisor_size_t kernel_elements_divisor = fxdiv_init_size_t(kernel_size.height * kernel_size.width); const struct fxdiv_divisor_size_t kernel_width_divisor = fxdiv_init_size_t(kernel_size.width); const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_size.width); for (size_t output_image_block_start = 0; output_image_block_start < output_image_size; output_image_block_start += output_image_block_max) { const size_t output_image_block_size = min(output_image_size - output_image_block_start, output_image_block_max); /* Pack image into L3 block */ NNP_INPUT_TRANSFORM_START(profile) struct input_packing_context input_packing_context = { .input = input, .packed_input = packed_input, .simd_width = simd_width, .reduction_block_start = reduction_block_start, .reduction_block_size = reduction_block_size, .output_image_block_start = output_image_block_start, .input_size = input_size, .input_padding_top = input_padding.top, .input_padding_left = input_padding.left, .kernel_elements = kernel_elements_divisor, .kernel_width = kernel_width_divisor, .output_width = output_width_divisor, .output_subsampling = output_subsampling, }; pthreadpool_parallelize_2d_tile_2d(threadpool, (pthreadpool_task_2d_tile_2d_t) compute_input_packing, &input_packing_context, reduction_block_size, output_image_block_size, 1, output_image_subblock_max, PTHREADPOOL_FLAG_DISABLE_DENORMALS); NNP_INPUT_TRANSFORM_END(profile) NNP_BLOCK_MULTIPLICATION_START(profile) struct matrix_multiplication_context matrix_multiplication_context = { .packed_kernel = packed_kernel, .packed_input = packed_input, .output = output, .reduction_block_start = reduction_block_start, .reduction_block_size = reduction_block_size, .output_image_size = output_image_size, .output_image_block_start = output_image_block_start, .output_image_subblock_max = output_image_subblock_max, .output_channels_subblock_max = output_channels_subblock_max, }; 
pthreadpool_parallelize_2d_tile_2d(threadpool, (pthreadpool_task_2d_tile_2d_t) compute_matrix_multiplication, &matrix_multiplication_context, output_channels, output_image_block_size, output_channels_block_max, output_image_subblock_max, PTHREADPOOL_FLAG_DISABLE_DENORMALS); NNP_BLOCK_MULTIPLICATION_END(profile) } } /* Add bias */ NNP_OUTPUT_TRANSFORM_START(profile) switch (activation) { case nnp_activation_identity: for (size_t output_channel = 0; output_channel < output_channels; output_channel += 1) { const float bias_value = bias[output_channel]; for (size_t index = 0; index < output_image_size; index += 1) { output[output_channel * output_image_size + index] += bias_value; } } break; case nnp_activation_relu: for (size_t output_channel = 0; output_channel < output_channels; output_channel += 1) { const float bias_value = bias[output_channel]; for (size_t index = 0; index < output_image_size; index += 1) { output[output_channel * output_image_size + index] = relu(output[output_channel * output_image_size + index] + bias_value, 0.0f); } } break; default: NNP_UNREACHABLE; } NNP_OUTPUT_TRANSFORM_END(profile) break; } case nnp_convolution_transform_strategy_precompute: { const size_t packed_kernel_size = output_channels * reduction_size * sizeof(float); if (workspace_buffer == NULL) { *workspace_size = packed_kernel_size; return nnp_status_success; } else { if (*workspace_size < packed_kernel_size) { return nnp_status_insufficient_buffer; } memory_block = workspace_buffer; } for (size_t reduction_block_start = 0; reduction_block_start < reduction_size; reduction_block_start += reduction_block_max) { const size_t reduction_block_size = min(reduction_size - reduction_block_start, reduction_block_max); /* Pack kernel into memory block */ NNP_KERNEL_TRANSFORM_START(profile) struct kernel_packing_context kernel_packing_context = { .kernel = kernel + reduction_block_start, .packed_kernel = (void*) workspace_buffer + output_channels * reduction_block_start * sizeof(float), 
.reduction_size = reduction_size, .reduction_block_start = reduction_block_start, .reduction_block_size = reduction_block_size, }; pthreadpool_parallelize_2d_tile_2d(threadpool, (pthreadpool_task_2d_tile_2d_t) compute_kernel_packing, &kernel_packing_context, output_channels, reduction_block_size, output_channels_subblock_max, 1, PTHREADPOOL_FLAG_DISABLE_DENORMALS); NNP_KERNEL_TRANSFORM_END(profile) } break; } default: return nnp_status_invalid_transform_strategy; } if (memory_block != workspace_buffer) { release_memory(memory_block, memory_size); } return status; } static enum nnp_status compute_direct_convolution_inference( const size_t input_channels, const size_t output_channels, const struct nnp_size image_size, const struct nnp_size kernel_size, const float* input, const float* kernel, const float* bias, float* output, void* workspace_buffer, size_t* workspace_size, enum nnp_activation activation, pthreadpool_t threadpool, struct nnp_profile* profile) { const size_t image_elements = image_size.height * image_size.width; if (workspace_buffer == NULL && workspace_size != NULL) { *workspace_size = 0; return nnp_status_success; } NNP_BLOCK_MULTIPLICATION_START(profile) struct direct_convolution_context direct_convolution_context = { .input = input, .kernel = kernel, .output = output, .image_elements = image_elements, .input_channels = input_channels, .input_channels_block_max = nnp_hwinfo.conv1x1.mr, .output_channels_block_max = nnp_hwinfo.conv1x1.nr, .fast_conv = nnp_hwinfo.conv1x1.only_mr_x_nr, .full_conv = nnp_hwinfo.conv1x1.upto_mr_x_nr, }; pthreadpool_parallelize_1d_tile_1d(threadpool, (pthreadpool_task_1d_tile_1d_t) compute_direct_convolution, &direct_convolution_context, output_channels, nnp_hwinfo.conv1x1.nr, PTHREADPOOL_FLAG_DISABLE_DENORMALS); NNP_BLOCK_MULTIPLICATION_END(profile) /* Add bias */ NNP_OUTPUT_TRANSFORM_START(profile) switch (activation) { case nnp_activation_identity: for (size_t output_channel = 0; output_channel < output_channels; 
output_channel += 1) { const float bias_value = bias[output_channel]; for (size_t index = 0; index < image_elements; index += 1) { output[output_channel * image_elements + index] += bias_value; } } break; case nnp_activation_relu: for (size_t output_channel = 0; output_channel < output_channels; output_channel += 1) { const float bias_value = bias[output_channel]; for (size_t index = 0; index < image_elements; index += 1) { output[output_channel * image_elements + index] = relu(output[output_channel * image_elements + index] + bias_value, 0.0f); } } break; default: NNP_UNREACHABLE; } NNP_OUTPUT_TRANSFORM_END(profile) return nnp_status_success; } static inline enum nnp_convolution_algorithm select_algorithm( struct nnp_size kernel_size, struct nnp_size output_subsampling, struct nnp_size output_size) { if (max(output_subsampling.height, output_subsampling.width) == 1) { /* Stride-1 convolution: consider fast convolution algorithm and direct 1x1 */ if (max(kernel_size.height, kernel_size.width) == 1) { return nnp_convolution_algorithm_direct; } else if (kernel_size.height == 3 && kernel_size.width == 3) { return nnp_convolution_algorithm_wt8x8; } else if (min(kernel_size.height, kernel_size.width) >= 2) { /* Consider FFT-based fast convolution */ if (max(kernel_size.height, kernel_size.width) <= 8) { /* Decide between FFT 8x8 and FFT 16x16 */ const size_t tile_count_8x8 = divide_round_up(output_size.height, 8 - kernel_size.height + 1) * divide_round_up(output_size.width, 8 - kernel_size.width + 1); const size_t tile_count_16x16 = divide_round_up(output_size.height, 16 - kernel_size.height + 1) * divide_round_up(output_size.width, 16 - kernel_size.width + 1); if (tile_count_8x8 <= 4 * tile_count_16x16) { /* 8x8 tiles are more efficient */ return nnp_convolution_algorithm_ft8x8; } else { return nnp_convolution_algorithm_ft16x16; } } else if (max(kernel_size.height, kernel_size.width) <= 16) { return nnp_convolution_algorithm_ft16x16; } } } /* Fall-back algorithm */ 
return nnp_convolution_algorithm_implicit_gemm; } enum nnp_status nnp_convolution_inference( enum nnp_convolution_algorithm algorithm, enum nnp_convolution_transform_strategy transform_strategy, size_t input_channels, size_t output_channels, struct nnp_size input_size, struct nnp_padding input_padding, struct nnp_size kernel_size, struct nnp_size output_subsampling, const float* input, const float* kernel, const float* bias, float* output, void* workspace_buffer, size_t* workspace_size, enum nnp_activation activation, const void* activation_parameters, pthreadpool_t threadpool, struct nnp_profile* profile) { NNP_TOTAL_START(profile) /* Basic validation of parameters. This check detects invalid, but not unsupported parameters. */ enum nnp_status status = validate_convolution_arguments( 1, input_channels, output_channels, input_size, input_padding, kernel_size, output_subsampling, activation, activation_parameters); if (status != nnp_status_success) { goto cleanup; } if (activation_parameters != NULL) { status = nnp_status_unsupported_activation_parameters; goto cleanup; } const struct nnp_size output_size = { .width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width) / output_subsampling.width + 1, .height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height) / output_subsampling.height + 1 }; if (algorithm == nnp_convolution_algorithm_auto) { algorithm = select_algorithm(kernel_size, output_subsampling, output_size); } struct nnp_size tile_size; size_t transform_element_size; bool fourier_transform; nnp_transform_2d_with_offset input_transform_function = NULL; nnp_transform_2d_with_offset kernel_transform_function = NULL; nnp_transform_2d_with_bias output_transform_function = NULL; switch (algorithm) { case nnp_convolution_algorithm_wt8x8_fp16: #if NNP_BACKEND_ARM if (kernel_size.height != 3 || kernel_size.width != 3) { status = nnp_status_unsupported_algorithm; goto cleanup; } if 
(max(output_subsampling.height, output_subsampling.width) > 1) { status = nnp_status_unsupported_algorithm; goto cleanup; } tile_size = (struct nnp_size) { .height = 8, .width = 8 }; transform_element_size = sizeof(uint16_t); fourier_transform = false; input_transform_function = nnp_hwinfo.transforms.iwt_f6x6_3x3_fp16_with_offset; kernel_transform_function = nnp_hwinfo.transforms.kwt_f6x6_3x3_fp16; switch (activation) { case nnp_activation_identity: output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias; break; case nnp_activation_relu: output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias_with_relu; break; default: NNP_UNREACHABLE; } if (input_transform_function != NULL && kernel_transform_function != NULL && output_transform_function != NULL) { break; } #endif /* * Fallthrough otherwise. The rationale here is that only some backends have fp16 storage natively implemented * (e.g. ARM NEON + VFP_FP16 currently), while configuration is (currently) fairly platform-independent. * Thus silently falling back to the baseline Winograd implementation is reasonable. 
*/ case nnp_convolution_algorithm_wt8x8: if (kernel_size.height != 3 || kernel_size.width != 3) { status = nnp_status_unsupported_algorithm; goto cleanup; } tile_size = (struct nnp_size) { .height = 8, .width = 8 }; transform_element_size = sizeof(float); fourier_transform = false; input_transform_function = nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream; kernel_transform_function = nnp_hwinfo.transforms.kwt_f6x6_3x3; switch (activation) { case nnp_activation_identity: if (output_subsampling.height == 1 && output_subsampling.width == 1) { output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias; } else if (output_subsampling.height == 2 && output_subsampling.width == 2) { output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias; } break; case nnp_activation_relu: if (output_subsampling.height == 1 && output_subsampling.width == 1) { output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu; } else if (output_subsampling.height == 2 && output_subsampling.width == 2) { output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias_with_relu; } break; default: NNP_UNREACHABLE; } break; case nnp_convolution_algorithm_ft8x8: if (max(kernel_size.height, kernel_size.width) > 8) { status = nnp_status_unsupported_algorithm; goto cleanup; } if (max(output_subsampling.height, output_subsampling.width) > 1) { status = nnp_status_unsupported_algorithm; goto cleanup; } tile_size = (struct nnp_size) { .height = 8, .width = 8 }; transform_element_size = sizeof(float); fourier_transform = true; input_transform_function = nnp_hwinfo.transforms.fft8x8_with_offset_and_stream; kernel_transform_function = nnp_hwinfo.transforms.fft8x8_with_offset_and_stream; switch (activation) { case nnp_activation_identity: output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias; break; case nnp_activation_relu: output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu; break; default: 
NNP_UNREACHABLE; } break; case nnp_convolution_algorithm_ft16x16: if (max(kernel_size.height, kernel_size.width) > 16) { status = nnp_status_unsupported_algorithm; goto cleanup; } if (max(output_subsampling.height, output_subsampling.width) > 1) { status = nnp_status_unsupported_algorithm; goto cleanup; } tile_size = (struct nnp_size) { .height = 16, .width = 16 }; transform_element_size = sizeof(float); fourier_transform = true; input_transform_function = nnp_hwinfo.transforms.fft16x16_with_offset_and_stream; kernel_transform_function = nnp_hwinfo.transforms.fft16x16_with_offset_and_stream; switch (activation) { case nnp_activation_identity: output_transform_function = nnp_hwinfo.transforms.ifft16x16_with_bias; break; case nnp_activation_relu: output_transform_function = nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu; break; default: NNP_UNREACHABLE; } break; case nnp_convolution_algorithm_implicit_gemm: break; case nnp_convolution_algorithm_direct: if (max(kernel_size.height, kernel_size.width) > 1) { status = nnp_status_unsupported_algorithm; goto cleanup; } if (max(output_subsampling.height, output_subsampling.width) > 1) { status = nnp_status_unsupported_algorithm; goto cleanup; } break; case nnp_convolution_algorithm_auto: NNP_UNREACHABLE; default: status = nnp_status_invalid_algorithm; goto cleanup; } switch (algorithm) { case nnp_convolution_algorithm_wt8x8: case nnp_convolution_algorithm_wt8x8_fp16: case nnp_convolution_algorithm_ft8x8: case nnp_convolution_algorithm_ft16x16: if (input_transform_function == NULL || kernel_transform_function == NULL || output_transform_function == NULL) { status = nnp_status_unsupported_algorithm; goto cleanup; } status = compute_fast_convolution_inference( fourier_transform, transform_strategy, transform_element_size, input_channels, output_channels, tile_size, input_size, input_padding, kernel_size, output_size, output_subsampling, input, kernel, bias, output, workspace_buffer, workspace_size, 
input_transform_function, kernel_transform_function, output_transform_function, threadpool, profile); break; case nnp_convolution_algorithm_implicit_gemm: status = compute_gemm_convolution_inference( transform_strategy, input_channels, output_channels, input_size, input_padding, kernel_size, output_size, output_subsampling, input, kernel, bias, output, workspace_buffer, workspace_size, activation, threadpool, profile); break; case nnp_convolution_algorithm_direct: if (transform_strategy != nnp_convolution_transform_strategy_compute) { status = nnp_status_unsupported_transform_strategy; goto cleanup; } status = compute_direct_convolution_inference( input_channels, output_channels, input_size, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, threadpool, profile); break; case nnp_convolution_algorithm_auto: NNP_UNREACHABLE; } cleanup: NNP_TOTAL_END(profile) return status; }