// Auto-generated file. Do not edit! // Template: src/f32-avgpool/rvv_9x.c.in // Generator: tools/xngen // // Copyright 2024 Imagination Technologies, inc. // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. #include #include "xnnpack/avgpool.h" #include void xnn_f32_avgpool_minmax_ukernel_9x__rvv_c2v( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, float* output, size_t input_increment, size_t output_increment, const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(output_pixels != 0); assert(kernel_elements != 0); assert(kernel_elements <= 9); assert(channels != 0); assert((input_offset & 3) == 0); input_offset >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; const float min = params->scalar.min; const float max = params->scalar.max; do { const float *i[9]; for (size_t kk = 0; kk < kernel_elements; ++kk) { assert(input[kk] != NULL); i[kk] = (input[kk] != zero ? input[kk] + input_offset : zero) ; } for (size_t tail = kernel_elements; tail < 9; ++tail) { i[tail] = zero; } input = (const float**) ((uintptr_t) input + input_increment); for (size_t c = channels; c != 0; ) { int32_t n = __riscv_vsetvl_e32m2(c); vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i[0], n); i[0] += n; vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i[1], n); i[1] += n; vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i[2], n); i[2] += n; vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i[3], n); i[3] += n; vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i[4], n); i[4] += n; vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i[5], n); i[5] += n; vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i[6], n); i[6] += n; vfloat32m2_t i7_f32v = __riscv_vle32_v_f32m2(i[7], n); i[7] += n; vfloat32m2_t i8_f32v = __riscv_vle32_v_f32m2(i[8], n); i[8] += n; vfloat32m2_t sum01_f32v = __riscv_vfadd_vv_f32m2(i0_f32v, i1_f32v, n); vfloat32m2_t sum23_f32v = __riscv_vfadd_vv_f32m2(i2_f32v, i3_f32v, n); vfloat32m2_t sum45_f32v = __riscv_vfadd_vv_f32m2(i4_f32v, i5_f32v, n); vfloat32m2_t sum67_f32v = __riscv_vfadd_vv_f32m2(i6_f32v, i7_f32v, n); vfloat32m2_t sum018_f32v = __riscv_vfadd_vv_f32m2(sum01_f32v, i8_f32v, n); vfloat32m2_t sum2345_f32v = __riscv_vfadd_vv_f32m2(sum23_f32v, sum45_f32v, n); vfloat32m2_t sum01678_f32v = __riscv_vfadd_vv_f32m2(sum018_f32v, sum67_f32v, n); vfloat32m2_t sum_f32v = __riscv_vfadd_vv_f32m2(sum2345_f32v, sum01678_f32v, n); vfloat32m2_t out_f32v = __riscv_vfmul_vf_f32m2(sum_f32v, scale, n); out_f32v = __riscv_vfmin_vf_f32m2(__riscv_vfmax_vf_f32m2(out_f32v, min, n), max, n); __riscv_vse32_v_f32m2(output, out_f32v, n); output += n; c -= n; } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); }