// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>  // For assert.
#include <stddef.h>  // For size_t and NULL.
#include <stdint.h>  // For uint32_t and SIZE_MAX.
#include <string.h>  // For memset.

#include "xnnpack.h"  // For xnn_operator_t.
#include "xnnpack/allocator.h"
#include "xnnpack/common.h"  // For XNN_ALLOCATION_ALIGNMENT.
#include "xnnpack/log.h"
#include "xnnpack/math.h"
#include "xnnpack/operator-utils.h"
#include "xnnpack/operator.h"  // For xnn_operator definition.
#include "xnnpack/operator-type.h"

void* xnn_get_pointer_to_write_weights(
  xnn_operator_t op,
  size_t aligned_weights_size,
  int padding_byte)
{
  assert(aligned_weights_size % XNN_ALLOCATION_ALIGNMENT == 0);
  void* weights_ptr = NULL;
  if (use_weights_cache(op)) {
    weights_ptr = op->weights_cache->reserve_space(op->weights_cache->context, aligned_weights_size);
    if (weights_ptr == NULL) {
      return NULL;
    }
  } else {
    op->packed_weights.pointer = xnn_allocate_simd_memory(aligned_weights_size);
    if (op->packed_weights.pointer == NULL) {
      return NULL;
    }
    weights_ptr = op->packed_weights.pointer;
  }
  memset(weights_ptr, padding_byte, aligned_weights_size);
  return weights_ptr;
}

size_t xnn_compute_convolution_output_dimension(
  size_t padded_input_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t subsampling_dimension)
{
  // E.g., a 3-tap kernel with dilation 2 covers (3 - 1) * 2 + 1 = 5 input
  // elements; with padded_input_dimension = 10 and subsampling (stride) 2,
  // the output dimension is (10 - 5) / 2 + 1 = 3.
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  // doz() is "difference or zero", i.e. max(a - b, 0), from xnnpack/math.h,
  // so an undersized input yields an output dimension of 1 rather than
  // underflowing.
  return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
}

size_t xnn_compute_deconvolution_output_dimension(
  size_t input_dimension,
  size_t output_padding_dimension,
  size_t adjustment_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t stride_dimension)
{
  // E.g., input 3, stride 2, 3-tap kernel with dilation 1, no adjustment or
  // output padding: 2 * (3 - 1) + 0 + 3 - 0 = 7.
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}

size_t xnn_compute_unpooling_output_dimension(
  size_t input_dimension,
  size_t input_padding_dimension,
  size_t kernel_dimension)
{
  return xnn_compute_deconvolution_output_dimension(
    input_dimension, input_padding_dimension, /*adjustment_dimension=*/0,
    kernel_dimension, /*dilation_dimension=*/1, /*stride_dimension=*/kernel_dimension);
}

// Calculate how much work a microkernel does.
// An MxN microkernel performs M+N (scalar) loads and M*N (scalar) FMAs per
// tile. So, for a given batch_size, the microkernel performs:
//   divide_round_up(batch_size, mr) * (mr + nr) loads, and
//   divide_round_up(batch_size, mr) * (mr * nr) FMAs.
// The total cost is a linear combination of these two operations. Based on
// experimental data, loads are weighted by a factor of 3, which biases the
// choice toward larger tile sizes with better computational intensity.
static size_t calculate_microkernel_cost(size_t batch_size, uint32_t mr, uint32_t nr)
{
  return divide_round_up(batch_size, mr) * (3 * (mr + nr) + mr * nr);
}
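
// Worked example of the cost heuristic (illustrative numbers only, not taken
// from any particular microkernel set): with batch_size = 7 and nr = 8,
//   mr = 4: divide_round_up(7, 4) = 2 passes, cost = 2 * (3 * (4 + 8) + 4 * 8) = 136;
//   mr = 8: divide_round_up(7, 8) = 1 pass,  cost = 1 * (3 * (8 + 8) + 8 * 8) = 112.
// The mr = 8 tile is cheaper even though one of its eight rows goes unused,
// which is exactly the bias toward higher computational intensity described above.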

static bool mr_is_available_gemm(size_t mr, struct xnn_hmp_gemm_ukernel *gemm_cases)
{
  return gemm_cases[mr - 1].function[XNN_UARCH_DEFAULT] != NULL;
}

uint32_t xnn_get_heuristic_mr_gemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_gemm_ukernel *gemm_cases)
{
  if (batch_size <= max_mr && mr_is_available_gemm(batch_size, gemm_cases)) {
    // We have a microkernel whose MR exactly matches batch_size.
    return batch_size;
  }

  // Try to find the best-fitting mr:
  // - use a cost heuristic to estimate how much work the microkernel does
  //   (see calculate_microkernel_cost),
  // - smaller cost is better.
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_gemm(mr, gemm_cases)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}

static bool mr_is_available_igemm(size_t mr, struct xnn_hmp_igemm_ukernel *igemm_cases)
{
  return igemm_cases[mr - 1].function[XNN_UARCH_DEFAULT] != NULL;
}

uint32_t xnn_get_heuristic_mr_igemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_igemm_ukernel *igemm_cases)
{
  if (batch_size <= max_mr && mr_is_available_igemm(batch_size, igemm_cases)) {
    // We have a microkernel whose MR exactly matches batch_size.
    return batch_size;
  }

  // Try to find the best-fitting mr:
  // - use a cost heuristic to estimate how much work the microkernel does
  //   (see calculate_microkernel_cost),
  // - smaller cost is better.
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_igemm(mr, igemm_cases)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}
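
// Sketch of a call site for the two heuristics above (hypothetical: the
// `gemm_params` struct and its fields are illustrative stand-ins for the GEMM
// configuration a caller holds; they are not defined in this file):
//
//   const uint32_t mr = xnn_get_heuristic_mr_gemm(
//     batch_size, gemm_params->mr, gemm_params->nr, gemm_params->gemm_cases);
//
// The selected mr indexes microkernel case mr - 1 and sets the tile height
// used when partitioning the batch dimension.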

enum xnn_status xnn_destroy_operator(xnn_operator_t op)
{
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to delete operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (op == NULL) {
    return xnn_status_invalid_parameter;
  }

  xnn_release_memory(op->indirection_buffer);
  // Packed weights held in a weights cache are owned by the cache, not the
  // operator, so only release them when no cache is in use.
  if (op->weights_cache == NULL) {
    xnn_release_simd_memory(op->packed_weights.pointer);
  }
  xnn_release_simd_memory(op->zero_buffer);
  if (op->zero_buffers) {
    // Start at 1: zero_buffers[0] is expected to alias zero_buffer, which was
    // released above.
    for (size_t i = 1; i < op->batch_size; ++i) {
      xnn_release_simd_memory(op->zero_buffers[i]);
    }
    xnn_release_memory(op->zero_buffers);
  }
  xnn_release_memory(op->pixelwise_buffer);
  xnn_release_memory(op->subconvolution_buffer);
  xnn_release_simd_memory(op->lookup_table);
  // Release the operator structure itself; without this, every destroyed
  // operator would leak.
  xnn_release_memory(op);
  return xnn_status_success;
}

const char* xnn_unary_operator_to_string(enum xnn_unary_operator op)
{
  switch (op) {
    case xnn_unary_abs: return "abs";
    case xnn_unary_bankers_rounding: return "bankers_rounding";
    case xnn_unary_ceiling: return "ceiling";
    case xnn_unary_clamp: return "clamp";
    case xnn_unary_convert: return "convert";
    case xnn_unary_elu: return "elu";
    case xnn_unary_exp: return "exp";
    case xnn_unary_floor: return "floor";
    case xnn_unary_gelu: return "gelu";
    case xnn_unary_hardswish: return "hardswish";
    case xnn_unary_leaky_relu: return "leaky_relu";
    case xnn_unary_log: return "log";
    case xnn_unary_negate: return "negate";
    case xnn_unary_reciprocal_square_root: return "reciprocal_square_root";
    case xnn_unary_sigmoid: return "sigmoid";
    case xnn_unary_square: return "square";
    case xnn_unary_square_root: return "square_root";
    case xnn_unary_tanh: return "tanh";
    case xnn_unary_cube_root: return "cube_root";
    case xnn_unary_cosine: return "cosine";
    case xnn_unary_sine: return "sine";
    case xnn_unary_count_leading_zeros: return "count_leading_zeros";
    case xnn_unary_bitwise_not: return "bitwise_not";
    case xnn_unary_popcount: return "popcount";
    case xnn_unary_sign: return "sign";
    case xnn_unary_invalid: return "invalid";
  }
  XNN_UNREACHABLE;
  return "unknown";
}

const char* xnn_binary_operator_to_string(enum xnn_binary_operator op)
{
  switch (op) {
    case xnn_binary_add: return "add";
    case xnn_binary_divide: return "divide";
    case xnn_binary_multiply: return "multiply";
    case xnn_binary_subtract: return "subtract";
    case xnn_binary_copysign: return "copysign";
    case xnn_binary_squared_difference: return "squared_difference";
    case xnn_binary_prelu: return "prelu";
    case xnn_binary_minimum: return "minimum";
    case xnn_binary_maximum: return "maximum";
    case xnn_binary_modulus: return "modulus";
    case xnn_binary_atan2: return "atan2";
    case xnn_binary_pow: return "pow";
    case xnn_binary_bitwise_and: return "bitwise_and";
    case xnn_binary_bitwise_or: return "bitwise_or";
    case xnn_binary_bitwise_xor: return "bitwise_xor";
    case xnn_binary_shift_left: return "shift_left";
    case xnn_binary_shift_right_logical: return "shift_right_logical";
    case xnn_binary_shift_right_arithmetic: return "shift_right_arithmetic";
    case xnn_binary_invalid: return "invalid";
  }
  XNN_UNREACHABLE;
  return "unknown";
}

enum xnn_operator_type xnn_reduce_operator_to_operator_type(enum xnn_reduce_operator type)
{
  switch (type) {
    case xnn_reduce_mean:
      return xnn_operator_type_mean_nd;
    case xnn_reduce_sum:
      return xnn_operator_type_sum_nd;
    default:
      return xnn_operator_type_invalid;
  }
}
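
// Example of how the helpers above are typically consumed (hypothetical log
// message; xnn_log_debug is the logging macro from xnnpack/log.h):
//
//   xnn_log_debug("creating %s operator",
//                 xnn_binary_operator_to_string(xnn_binary_add));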