// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"] or not REQUANTIZATION
$assert DATATYPE in ["QC8", "QS8", "QD8_F16", "QD8_F32"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
$assert DATATYPE not in ["QD8_F16", "QD8_F32"] or not REQUANTIZATION

#include "xnnpack/assembly.h"

$DATATYPE_SPEC = {"QC8": "qs8_qc8w", "QS8": "qs8", "QD8_F16" : "qd8_f16_qc8w", "QD8_F32": "qd8_f32_qc8w"}[DATATYPE]
$PARAMS_UNION = {"QC8": "xnn_qs8_qc8w_conv_minmax_params", "QS8": "xnn_qs8_conv_minmax_params", "QD8_F16": "xnn_f16_minmax_params", "QD8_F32": "xnn_f32_minmax_params"}[DATATYPE]
$REQUANTIZATION_SPEC = "_" + REQUANTIZATION.lower() if REQUANTIZATION else ""
$REWIND_DECREMENT = 3 if DATATYPE == "QC8" else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION] if REQUANTIZATION else 4
$SP_OFFSET = 48 if DATATYPE in ["QD8_F16", "QD8_F32"] else 32
$PARAM_OFFSET = 32 if DATATYPE in ["QD8_F16", "QD8_F32"] else 24
$STACK_REGISTERS = 12 if DATATYPE in ["QD8_F16", "QD8_F32"] else 11
$SCALING_PARAMS = "xnn_qd8_quantization_params" if DATATYPE in ["QD8_F16", "QD8_F32"] else ""
$ISA = "fp16arith" if DATATYPE in ["QD8_F16"] else ""
# void xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_4x16c4__asm_aarch64_neondot${ISA}_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
$if DATATYPE in ["QD8_F16", "QD8_F32"]:
  #     const int8_t* zero_data,             [sp + 24] -> x19
  #     const union ${PARAMS_UNION} *params,  [sp + 32]  -> x11
  #     const struct ${SCALING_PARAMS} *quantization_params) [sp + 40] -> x17
$else:
  #     const union ${PARAMS_UNION} *params)  [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

// Register usage
// A0  x13  v0  v4
// A1  x14  v1  v5
// A2  x15  v2  v6
// A3  x10  v3  v7
// B    x5  v8  v9 v10 v11
// C0   x6 v16 v20 v24 v28
// C1  x16 v17 v21 v25 v29
// C2  x17 v18 v22 v26 v30
// C3   x7 v19 v23 v27 v31
$if DATATYPE in ["QD8_F16", "QD8_F32"]:
  // unused v13 v14 v15
$else:
  // unused v12 v13 v14 v15

// x11 temp for Cortex-A55 loads

BEGIN_FUNCTION xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_4x16c4__asm_aarch64_neondot${ISA}_cortex_a55

        # Clamp C pointers
        CMP         x0, 2                   // if mr < 2
        LDR         x8, [sp, 8]             // Load a_offset
        ADD         x16, x6, x7             // c1 = c0 + cm_stride
        LDR         x12, [sp, 16]       // Load zero
        LDR         x11, [sp, ${PARAM_OFFSET}]      // Load params pointer
        CSEL        x16, x6,  x16, LO       //   c1 = c0
        ADD         x2, x2, 3               // kc = (kc + 3) & ~3
        STP         d8,  d9, [sp, -${SP_OFFSET}]!   // Save d8-d11 on stack
        $if DATATYPE in ["QD8_F16", "QD8_F32"]:
          STR         x19, [sp, 40]       // Save x19 to stack
          LDR         x19, [sp, 72]       // Load zero_data
          STR         d12,  [sp, 32]
          LDR         x17, [sp, 88]           // &quantization_params.zero_point
          LD1         {v12.4s}, [x17]         // zero point and scale

        ADD         x17, x16, x7            // c2 = c1 + cm_stride
        STP         d10, d11, [sp, 16]
                                            // if mr <= 2
        CSEL        x17, x16, x17, LS       //   c2 = c1
        BIC         x2, x2, 3

        CMP         x0, 4                   // if mr < 4
        ADD         x7,  x17, x7            // c3 = c2 + cm_stride
        CSEL        x7,  x17, x7, LO        //   c3 = c2

        .p2align    3
0:
        # Load initial bias from w into accumulators
        LDP         q16, q20, [x5], 32
        $if DATATYPE not in ["QD8_F16", "QD8_F32"]:
          MOV         v17.16b, v16.16b
          MOV         v18.16b, v16.16b
          LDP         q24, q28, [x5], 32
          MOV         v19.16b, v16.16b
          MOV         v21.16b, v20.16b
          MOV         v22.16b, v20.16b
          MOV         v23.16b, v20.16b
          MOV         v25.16b, v24.16b
          MOV         v26.16b, v24.16b
          MOV         v27.16b, v24.16b
          MOV         v29.16b, v28.16b
          MOV         v30.16b, v28.16b
          MOV         v31.16b, v28.16b
        $else:
          MUL         v17.4s, v16.4s, v12.s[0]
          MUL         v18.4s, v16.4s, v12.s[0]
          LDP         q24, q28, [x5], 32
          MUL         v19.4s, v16.4s, v12.s[0]
          MUL         v21.4s, v20.4s, v12.s[0]
          MUL         v22.4s, v20.4s, v12.s[0]
          MUL         v23.4s, v20.4s, v12.s[0]
          MUL         v25.4s, v24.4s, v12.s[0]
          MUL         v26.4s, v24.4s, v12.s[0]
          MUL         v27.4s, v24.4s, v12.s[0]
          MUL         v29.4s, v28.4s, v12.s[0]
          MUL         v30.4s, v28.4s, v12.s[0]
          MUL         v31.4s, v28.4s, v12.s[0]
          MUL         v24.4s, v24.4s, v12.s[0]
          MUL         v28.4s, v28.4s, v12.s[0]
          MUL         v16.4s, v16.4s, v12.s[0]
          MUL         v20.4s, v20.4s, v12.s[0]
        MOV         x9, x3                  // p = ks

        .p2align    3
1:
        # Load next 4 A pointers
        LDP         x13, x14, [x4], 16
        LDP         x15, x10, [x4], 16

        CMP         x13, x12                // if a0 == zero
        ADD         x13, x13, x8            // a0 += a_offset
        $if DATATYPE in ["QD8_F16", "QD8_F32"]:
          CSEL        x13, x19, x13, EQ       //   a0 = zero_data, else a0 += a_offset
        $else:
          CSEL        x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP         x14, x12                // if a1 == zero
        ADD         x14, x14, x8            // a1 += a_offset
        $if DATATYPE in ["QD8_F16", "QD8_F32"]:
          CSEL        x14, x19, x14, EQ       //   a1 = zero_data, else a1 += a_offset
        $else:
          CSEL        x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
        CMP         x15, x12                // if a2 == zero
        ADD         x15, x15, x8            // a2 += a_offset
        $if DATATYPE in ["QD8_F16", "QD8_F32"]:
          CSEL        x15, x19, x15, EQ       //   a2 = zero_data, else a2 += a_offset
        $else:
          CSEL        x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
        CMP         x10, x12                // if a3 == zero
        ADD         x10, x10, x8            // a3 += a_offset
        $if DATATYPE in ["QD8_F16", "QD8_F32"]:
          CSEL        x10, x19, x10, EQ       //   a3 = zero_data, else a3 += a_offset
        $else:
          CSEL        x10, x12, x10, EQ       //   a3 = zero, else a3 += a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS        x0, x2, 16              // k = kc - 16
        B.LO        5f

        # prologue - read A and B values for block 0 and 1
        LDR         d0, [x13], 8
        LDR         q8,  [x5], 16
        LDR         d1, [x14], 8
        LDR         d2, [x15], 8
        LDR         d3, [x10], 8
        SUBS        x0, x0, 16              // is there 16 for main loop?
        LDR         d9,  [x5], 8
        LDR         x11,  [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO        3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.

        .p2align    3
2:
        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v0.4b[0]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v1.4b[0]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v3.4b[0]
        LDR         d4,  [x13], 8

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v0.4b[0]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v1.4b[0]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v3.4b[0]
        LDR         d5, [x14], 8

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v0.4b[0]
        LDR         d8,  [x5], 8
        SDOT        v25.4s, v10.16b, v1.4b[0]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v27.4s, v10.16b, v3.4b[0]
        LDR         d6, [x15], 8

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v0.4b[0]
        LDR         d9,  [x5], 8
        SDOT        v29.4s, v11.16b, v1.4b[0]
        INS         v8.d[1], x11
        SDOT        v30.4s, v11.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v31.4s, v11.16b, v3.4b[0]
        LDR         d7,  [x10], 8

        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v0.4b[1]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v1.4b[1]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v0.4b[1]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v1.4b[1]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v0.4b[1]
        LDR         d8,  [x5], 8
        SDOT        v25.4s, v10.16b, v1.4b[1]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v0.4b[1]
        LDR         d9,  [x5], 8
        SDOT        v29.4s, v11.16b, v1.4b[1]
        INS         v8.d[1], x11
        SDOT        v30.4s, v11.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v4.4b[0]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v5.4b[0]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v7.4b[0]
        LDR         d0,  [x13], 8

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v4.4b[0]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v5.4b[0]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v7.4b[0]
        LDR         d1, [x14], 8

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v4.4b[0]
        LDR         d8,  [x5], 8
        SDOT        v25.4s, v10.16b, v5.4b[0]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v27.4s, v10.16b, v7.4b[0]
        LDR         d2, [x15], 8

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v4.4b[0]
        LDR         d9,  [x5], 8
        SDOT        v29.4s, v11.16b, v5.4b[0]
        INS         v8.d[1], x11
        SDOT        v30.4s, v11.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v31.4s, v11.16b, v7.4b[0]
        LDR         d3,  [x10], 8

        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v4.4b[1]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v5.4b[1]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v6.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v4.4b[1]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v5.4b[1]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v6.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v4.4b[1]
        LDR         d8,  [x5], 8            // First B values for block 0 and 1
        SDOT        v25.4s, v10.16b, v5.4b[1]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v6.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v27.4s, v10.16b, v7.4b[1]
        SUBS        x0, x0, 16

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v4.4b[1]
        LDR         d9,  [x5], 8
        SDOT        v29.4s, v11.16b, v5.4b[1]
        INS         v8.d[1], x11
        SDOT        v30.4s, v11.16b, v6.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v31.4s, v11.16b, v7.4b[1]
        B.HS        2b

        # Epilogue.  Same as main loop but no preloads in final group
3:
        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v0.4b[0]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v1.4b[0]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v3.4b[0]
        LDR         d4,  [x13], 8

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v0.4b[0]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v1.4b[0]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v3.4b[0]
        LDR         d5, [x14], 8

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v0.4b[0]
        LDR         d8,  [x5], 8
        SDOT        v25.4s, v10.16b, v1.4b[0]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v27.4s, v10.16b, v3.4b[0]
        LDR         d6, [x15], 8

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v0.4b[0]
        LDR         d9,  [x5], 8
        SDOT        v29.4s, v11.16b, v1.4b[0]
        INS         v8.d[1], x11
        SDOT        v30.4s, v11.16b, v2.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v31.4s, v11.16b, v3.4b[0]
        LDR         d7,  [x10], 8

        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v0.4b[1]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v1.4b[1]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v0.4b[1]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v1.4b[1]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v0.4b[1]
        LDR         d8,  [x5], 8
        SDOT        v25.4s, v10.16b, v1.4b[1]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v0.4b[1]
        LDR         d9,  [x5], 8
        SDOT        v29.4s, v11.16b, v1.4b[1]
        INS         v8.d[1], x11
        SDOT        v30.4s, v11.16b, v2.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v4.4b[0]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v5.4b[0]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v4.4b[0]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v5.4b[0]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v4.4b[0]
        LDR         d8,  [x5], 8
        SDOT        v25.4s, v10.16b, v5.4b[0]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v4.4b[0]
        LDR         d9,  [x5], 8
        SDOT        v29.4s, v11.16b, v5.4b[0]
        INS         v8.d[1], x11
        SDOT        v30.4s, v11.16b, v6.4b[0]
        LDR         x11,  [x5], 8
        SDOT        v31.4s, v11.16b, v7.4b[0]

        # BLOCK 0
        SDOT        v16.4s,  v8.16b, v4.4b[1]
        LDR         d10,  [x5], 8
        SDOT        v17.4s,  v8.16b, v5.4b[1]
        INS         v9.d[1], x11
        SDOT        v18.4s,  v8.16b, v6.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT        v20.4s,  v9.16b, v4.4b[1]
        LDR         d11,  [x5], 8
        SDOT        v21.4s,  v9.16b, v5.4b[1]
        INS         v10.d[1], x11
        SDOT        v22.4s,  v9.16b, v6.4b[1]
        LDR         x11,  [x5], 8
        SDOT        v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT        v24.4s, v10.16b, v4.4b[1]
        SDOT        v25.4s, v10.16b, v5.4b[1]
        INS         v11.d[1], x11
        SDOT        v26.4s, v10.16b, v6.4b[1]
        SDOT        v27.4s, v10.16b, v7.4b[1]
        AND         x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT        v28.4s, v11.16b, v4.4b[1]
        SDOT        v29.4s, v11.16b, v5.4b[1]
        LDR         x11, [sp, ${SP_OFFSET + PARAM_OFFSET}]                  // reload params pointer
        SDOT        v30.4s, v11.16b, v6.4b[1]
        SDOT        v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ        x0, 6f

        .p2align    3
4:
        # ks loop
        SUBS        x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI        1b

        $if DATATYPE in ["QD8_F16", "QD8_F32"]:
          SCVTF       v16.4s, v16.4s
          SCVTF       v17.4s, v17.4s
          SCVTF       v18.4s, v18.4s
          SCVTF       v19.4s, v19.4s
          SCVTF       v20.4s, v20.4s
          SCVTF       v21.4s, v21.4s
          SCVTF       v22.4s, v22.4s
          SCVTF       v23.4s, v23.4s
          LDP         q0, q1, [x5], 32        // kernel_scale
          SCVTF       v24.4s, v24.4s
          SCVTF       v25.4s, v25.4s
          SCVTF       v26.4s, v26.4s
          SCVTF       v27.4s, v27.4s
          SCVTF       v28.4s, v28.4s
          SCVTF       v29.4s, v29.4s
          SCVTF       v30.4s, v30.4s
          SCVTF       v31.4s, v31.4s
          LDP         q2, q3, [x5], 32
          FMUL        v4.4s, v0.4s, v12.s[1]  // kernel_scale * scale
          FMUL        v5.4s, v1.4s, v12.s[1]
          FMUL        v6.4s, v2.4s, v12.s[1]
          FMUL        v7.4s, v3.4s, v12.s[1]
          FMUL        v8.4s, v0.4s, v12.s[1]
          FMUL        v9.4s, v1.4s, v12.s[1]
          FMUL        v10.4s, v2.4s, v12.s[1]
          FMUL        v11.4s, v3.4s, v12.s[1]
          FMUL        v16.4s, v16.4s, v4.4s
          FMUL        v20.4s, v20.4s, v5.4s
          FMUL        v24.4s, v24.4s, v6.4s
          FMUL        v28.4s, v28.4s, v7.4s
          FMUL        v17.4s, v17.4s, v8.4s
          FMUL        v21.4s, v21.4s, v9.4s
          FMUL        v25.4s, v25.4s, v10.4s
          FMUL        v29.4s, v29.4s, v11.4s
          FMUL        v4.4s, v0.4s, v12.s[1]
          FMUL        v5.4s, v1.4s, v12.s[1]
          FMUL        v6.4s, v2.4s, v12.s[1]
          FMUL        v7.4s, v3.4s, v12.s[1]
          FMUL        v8.4s, v0.4s, v12.s[1]
          FMUL        v9.4s, v1.4s, v12.s[1]
          FMUL        v10.4s, v2.4s, v12.s[1]
          FMUL        v11.4s, v3.4s, v12.s[1]
          LDP         q0, q1, [x5], 32        // bias
          FMUL        v18.4s, v18.4s, v4.4s
          FMUL        v22.4s, v22.4s, v5.4s
          FMUL        v26.4s, v26.4s, v6.4s
          FMUL        v30.4s, v30.4s, v7.4s
          FMUL        v19.4s, v19.4s, v8.4s
          FMUL        v23.4s, v23.4s, v9.4s
          FMUL        v27.4s, v27.4s, v10.4s
          FMUL        v31.4s, v31.4s, v11.4s
          LDP         q2, q3, [x5], 32
          FADD        v16.4s, v16.4s, v0.4s
          FADD        v17.4s, v17.4s, v0.4s
          FADD        v18.4s, v18.4s, v0.4s
          FADD        v19.4s, v19.4s, v0.4s
          FADD        v20.4s, v20.4s, v1.4s
          FADD        v21.4s, v21.4s, v1.4s
          FADD        v22.4s, v22.4s, v1.4s
          FADD        v23.4s, v23.4s, v1.4s
          $if DATATYPE == "QD8_F32":
            LD2R        {v0.4s, v1.4s}, [x11]       // min max
          $elif DATATYPE == "QD8_F16":
            LD2R        {v0.8h, v1.8h}, [x11]       // min max
          FADD        v24.4s, v24.4s, v2.4s
          FADD        v25.4s, v25.4s, v2.4s
          FADD        v26.4s, v26.4s, v2.4s
          FADD        v27.4s, v27.4s, v2.4s
          FADD        v28.4s, v28.4s, v3.4s
          FADD        v29.4s, v29.4s, v3.4s
          FADD        v30.4s, v30.4s, v3.4s
          FADD        v31.4s, v31.4s, v3.4s
          $if DATATYPE == "QD8_F32":
            FMAX        v16.4s, v16.4s, v0.4s
            FMAX        v17.4s, v17.4s, v0.4s
            FMAX        v18.4s, v18.4s, v0.4s
            FMAX        v19.4s, v19.4s, v0.4s
            FMAX        v20.4s, v20.4s, v0.4s
            FMAX        v21.4s, v21.4s, v0.4s
            FMAX        v22.4s, v22.4s, v0.4s
            FMAX        v23.4s, v23.4s, v0.4s
            FMAX        v24.4s, v24.4s, v0.4s
            FMAX        v25.4s, v25.4s, v0.4s
            FMAX        v26.4s, v26.4s, v0.4s
            FMAX        v27.4s, v27.4s, v0.4s
            FMAX        v28.4s, v28.4s, v0.4s
            FMAX        v29.4s, v29.4s, v0.4s
            FMAX        v30.4s, v30.4s, v0.4s
            FMAX        v31.4s, v31.4s, v0.4s
            FMIN        v16.4s, v16.4s, v1.4s
            FMIN        v17.4s, v17.4s, v1.4s
            FMIN        v18.4s, v18.4s, v1.4s
            FMIN        v19.4s, v19.4s, v1.4s
            FMIN        v20.4s, v20.4s, v1.4s
            FMIN        v21.4s, v21.4s, v1.4s
            FMIN        v22.4s, v22.4s, v1.4s
            FMIN        v23.4s, v23.4s, v1.4s
            FMIN        v24.4s, v24.4s, v1.4s
            FMIN        v25.4s, v25.4s, v1.4s
            FMIN        v26.4s, v26.4s, v1.4s
            FMIN        v27.4s, v27.4s, v1.4s
            FMIN        v28.4s, v28.4s, v1.4s
            FMIN        v29.4s, v29.4s, v1.4s
            FMIN        v30.4s, v30.4s, v1.4s
            FMIN        v31.4s, v31.4s, v1.4s
          $elif DATATYPE == "QD8_F16":
            FCVTN       v16.4h, v16.4s
            FCVTN       v17.4h, v17.4s
            FCVTN       v18.4h, v18.4s
            FCVTN       v19.4h, v19.4s
            FCVTN       v24.4h, v24.4s
            FCVTN       v25.4h, v25.4s
            FCVTN       v26.4h, v26.4s
            FCVTN       v27.4h, v27.4s
            FCVTN2      v16.8h, v20.4s
            FCVTN2      v17.8h, v21.4s
            FCVTN2      v18.8h, v22.4s
            FCVTN2      v19.8h, v23.4s
            FCVTN2      v24.8h, v28.4s
            FCVTN2      v25.8h, v29.4s
            FCVTN2      v26.8h, v30.4s
            FCVTN2      v27.8h, v31.4s
            FMAX        v16.8h, v16.8h, v0.8h
            FMAX        v17.8h, v17.8h, v0.8h
            FMAX        v18.8h, v18.8h, v0.8h
            FMAX        v19.8h, v19.8h, v0.8h
            FMAX        v24.8h, v24.8h, v0.8h
            FMAX        v25.8h, v25.8h, v0.8h
            FMAX        v26.8h, v26.8h, v0.8h
            FMAX        v27.8h, v27.8h, v0.8h
            FMIN        v16.8h, v16.8h, v1.8h
            FMIN        v17.8h, v17.8h, v1.8h
            FMIN        v18.8h, v18.8h, v1.8h
            FMIN        v19.8h, v19.8h, v1.8h
            FMIN        v24.8h, v24.8h, v1.8h
            FMIN        v25.8h, v25.8h, v1.8h
            FMIN        v26.8h, v26.8h, v1.8h
            FMIN        v27.8h, v27.8h, v1.8h
          SUBS        x1, x1, 16
          LDR         x0, [sp, 48]            // cn_stride
          B.LO        7f
          $if DATATYPE == "QD8_F32":
            STP         q19, q23, [x7]
            STP         q27, q31, [x7, #32]
            ADD         x7, x7, x0

            STP         q18, q22, [x17]
            STP         q26, q30, [x17, #32]
            ADD         x17, x17, x0

            STP         q17, q21, [x16]
            STP         q25, q29, [x16, #32]
            ADD         x16, x16, x0

            STP         q16, q20, [x6]
            STP         q24, q28, [x6, #32]
            ADD         x6, x6, x0
          $elif DATATYPE == "QD8_F16":
            STP         q19, q27, [x7]
            ADD         x7, x7, x0

            STP         q18, q26, [x17]
            ADD         x17, x17, x0

            STP         q17, q25, [x16]
            ADD         x16, x16, x0

            STP         q16, q24, [x6]
            ADD         x6, x6, x0
          SUB         x4, x4, x3              // a -= ks
          B.NE        0b
        $elif REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R        {v4.4s}, [x11], 4
          SQSHL       v16.4s, v16.4s, v4.4s   // shift to upper bits
          SQSHL       v17.4s, v17.4s, v4.4s
          SQSHL       v18.4s, v18.4s, v4.4s
          SQSHL       v19.4s, v19.4s, v4.4s
          SQSHL       v20.4s, v20.4s, v4.4s
          SQSHL       v21.4s, v21.4s, v4.4s
          SQSHL       v22.4s, v22.4s, v4.4s
          SQSHL       v23.4s, v23.4s, v4.4s
          LD1R        {v5.4s}, [x11], 4
          SQSHL       v24.4s, v24.4s, v4.4s
          SQSHL       v25.4s, v25.4s, v4.4s
          SQSHL       v26.4s, v26.4s, v4.4s
          SQSHL       v27.4s, v27.4s, v4.4s
          SQSHL       v28.4s, v28.4s, v4.4s
          SQSHL       v29.4s, v29.4s, v4.4s
          SQSHL       v30.4s, v30.4s, v4.4s
          SQSHL       v31.4s, v31.4s, v4.4s
          LD1R        {v6.4s}, [x11], 4
          SQDMULH     v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH     v17.4s, v17.4s, v5.4s
          SQDMULH     v18.4s, v18.4s, v5.4s
          SQDMULH     v19.4s, v19.4s, v5.4s
          SQDMULH     v20.4s, v20.4s, v5.4s
          SQDMULH     v21.4s, v21.4s, v5.4s
          SQDMULH     v22.4s, v22.4s, v5.4s
          SQDMULH     v23.4s, v23.4s, v5.4s
          SQDMULH     v24.4s, v24.4s, v5.4s
          SQDMULH     v25.4s, v25.4s, v5.4s
          SQDMULH     v26.4s, v26.4s, v5.4s
          SQDMULH     v27.4s, v27.4s, v5.4s
          SQDMULH     v28.4s, v28.4s, v5.4s
          SQDMULH     v29.4s, v29.4s, v5.4s
          SQDMULH     v30.4s, v30.4s, v5.4s
          SQDMULH     v31.4s, v31.4s, v5.4s
          SRSHL       v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL       v17.4s, v17.4s, v6.4s
          SRSHL       v18.4s, v18.4s, v6.4s
          SRSHL       v19.4s, v19.4s, v6.4s
          SRSHL       v20.4s, v20.4s, v6.4s
          SRSHL       v21.4s, v21.4s, v6.4s
          SRSHL       v22.4s, v22.4s, v6.4s
          SRSHL       v23.4s, v23.4s, v6.4s
          SRSHL       v24.4s, v24.4s, v6.4s
          SRSHL       v25.4s, v25.4s, v6.4s
          SRSHL       v26.4s, v26.4s, v6.4s
          SRSHL       v27.4s, v27.4s, v6.4s
          SRSHL       v28.4s, v28.4s, v6.4s
          SRSHL       v29.4s, v29.4s, v6.4s
          SRSHL       v30.4s, v30.4s, v6.4s
          SRSHL       v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          SCVTF       v16.4s, v16.4s
          SCVTF       v17.4s, v17.4s
          $if DATATYPE != "QC8":
            # Apply params - scale, bias and clamp
            LD1R        {v4.4s}, [x11], 4
            SCVTF       v18.4s, v18.4s
            SCVTF       v19.4s, v19.4s
          $else:
            # Load per channel scale values from weights
            LDR         q4, [x5], 16
            SCVTF       v18.4s, v18.4s
            SCVTF       v19.4s, v19.4s
            LDR         q5, [x5], 16
          SCVTF       v20.4s, v20.4s
          SCVTF       v21.4s, v21.4s
          SCVTF       v22.4s, v22.4s
          SCVTF       v23.4s, v23.4s
          SCVTF       v24.4s, v24.4s
          SCVTF       v25.4s, v25.4s
          SCVTF       v26.4s, v26.4s
          SCVTF       v27.4s, v27.4s
          SCVTF       v28.4s, v28.4s
          SCVTF       v29.4s, v29.4s
          SCVTF       v30.4s, v30.4s
          SCVTF       v31.4s, v31.4s

          $if DATATYPE == "QC8":
            LDR         q6, [x5], 16
            FMUL        v16.4s, v16.4s, v4.4s
            FMUL        v17.4s, v17.4s, v4.4s
            FMUL        v18.4s, v18.4s, v4.4s
            FMUL        v19.4s, v19.4s, v4.4s
            FMUL        v20.4s, v20.4s, v5.4s
            LDR         q4, [x5], 16
            FMUL        v21.4s, v21.4s, v5.4s
            FMUL        v22.4s, v22.4s, v5.4s
            FMUL        v23.4s, v23.4s, v5.4s
            FMUL        v24.4s, v24.4s, v6.4s
            FMUL        v25.4s, v25.4s, v6.4s
            FMUL        v26.4s, v26.4s, v6.4s
            FMUL        v27.4s, v27.4s, v6.4s
            FMUL        v28.4s, v28.4s, v4.4s
            FMUL        v29.4s, v29.4s, v4.4s
            FMUL        v30.4s, v30.4s, v4.4s
            FMUL        v31.4s, v31.4s, v4.4s
          $else:
            FMUL        v16.4s, v16.4s, v4.4s
            FMUL        v17.4s, v17.4s, v4.4s
            FMUL        v18.4s, v18.4s, v4.4s
            FMUL        v19.4s, v19.4s, v4.4s
            FMUL        v20.4s, v20.4s, v4.4s
            FMUL        v21.4s, v21.4s, v4.4s
            FMUL        v22.4s, v22.4s, v4.4s
            FMUL        v23.4s, v23.4s, v4.4s
            FMUL        v24.4s, v24.4s, v4.4s
            FMUL        v25.4s, v25.4s, v4.4s
            FMUL        v26.4s, v26.4s, v4.4s
            FMUL        v27.4s, v27.4s, v4.4s
            FMUL        v28.4s, v28.4s, v4.4s
            FMUL        v29.4s, v29.4s, v4.4s
            FMUL        v30.4s, v30.4s, v4.4s
            FMUL        v31.4s, v31.4s, v4.4s

          FCVTNS      v16.4s, v16.4s
          FCVTNS      v17.4s, v17.4s
          FCVTNS      v18.4s, v18.4s
          FCVTNS      v19.4s, v19.4s
          FCVTNS      v20.4s, v20.4s
          FCVTNS      v21.4s, v21.4s
          FCVTNS      v22.4s, v22.4s
          FCVTNS      v23.4s, v23.4s
          FCVTNS      v24.4s, v24.4s
          FCVTNS      v25.4s, v25.4s
          FCVTNS      v26.4s, v26.4s
          FCVTNS      v27.4s, v27.4s
          FCVTNS      v28.4s, v28.4s
          FCVTNS      v29.4s, v29.4s
          FCVTNS      v30.4s, v30.4s
          FCVTNS      v31.4s, v31.4s

        $if DATATYPE not in ["QD8_F16", "QD8_F32"]:
          # Integer output path, emitted for every DATATYPE except QD8_F16 and QD8_F32.
          # Turns the sixteen 32-bit accumulators v16-v31 into four rows of 16 int8
          # values in v0-v3, then either stores a full 4 x 16 tile or branches to the
          # partial-width store at label 7. Loads from params (x11) are interleaved
          # with the arithmetic.
          # Saturating narrow 32 to 16 bits into the low halves of v16-v19 and v24-v27.
          SQXTN       v16.4h, v16.4s
          SQXTN       v17.4h, v17.4s
          SQXTN       v18.4h, v18.4s
          SQXTN       v19.4h, v19.4s
          SQXTN       v24.4h, v24.4s
          SQXTN       v25.4h, v25.4s
          SQXTN       v26.4h, v26.4s
          SQXTN       v27.4h, v27.4s
          LD1R        {v6.8h}, [x11], 2       // add bias

          # Fill the high halves from v20-v23 and v28-v31, so each of v16-v19 and
          # v24-v27 now holds eight 16-bit values.
          SQXTN2      v16.8h, v20.4s
          SQXTN2      v17.8h, v21.4s
          SQXTN2      v18.8h, v22.4s
          SQXTN2      v19.8h, v23.4s
          SQXTN2      v24.8h, v28.4s
          SQXTN2      v25.8h, v29.4s
          SQXTN2      v26.8h, v30.4s
          SQXTN2      v27.8h, v31.4s

          # Saturating add of the 16-bit bias broadcast in v6.
          SQADD       v16.8h, v16.8h, v6.8h
          SQADD       v17.8h, v17.8h, v6.8h
          SQADD       v18.8h, v18.8h, v6.8h
          SQADD       v19.8h, v19.8h, v6.8h
          SQADD       v24.8h, v24.8h, v6.8h
          SQADD       v25.8h, v25.8h, v6.8h
          SQADD       v26.8h, v26.8h, v6.8h
          SQADD       v27.8h, v27.8h, v6.8h
          LD1R        {v4.16b}, [x11], 1      // clamp min value

          # Saturating narrow 16 to 8 bits: v0-v3 take their low 8 bytes from v16-v19
          # and their high 8 bytes from v24-v27, giving 16 int8 values per row.
          SQXTN       v0.8b, v16.8h
          SQXTN       v1.8b, v17.8h
          SQXTN       v2.8b, v18.8h
          SQXTN       v3.8b, v19.8h
          LD1R        {v5.16b}, [x11]         // clamp max value
          SQXTN2      v0.16b, v24.8h
          SQXTN2      v1.16b, v25.8h
          SQXTN2      v2.16b, v26.8h
          SQXTN2      v3.16b, v27.8h
          # Clamp v0-v3 to the range [v4, v5]. The cn_stride load, the params pointer
          # rewind and the nc decrement are interleaved with the SMAX/SMIN sequence.
          LDR         x0, [sp, 32]            // cn_stride
          SMAX        v0.16b, v0.16b, v4.16b
          SMAX        v1.16b, v1.16b, v4.16b
          SUB         x11, x11, ${REWIND_DECREMENT}           // rewind params pointer
          SMAX        v2.16b, v2.16b, v4.16b
          SMAX        v3.16b, v3.16b, v4.16b
          SUBS        x1, x1, 16              // nc -= 16; flags feed B.LO and B.HI below
          SMIN        v0.16b, v0.16b, v5.16b
          SMIN        v1.16b, v1.16b, v5.16b
          SMIN        v2.16b, v2.16b, v5.16b
          SMIN        v3.16b, v3.16b, v5.16b
          B.LO        7f                      // fewer than 16 columns were left

          # Store full 4 x 16
          # Each of the four output pointers is post-incremented by cn_stride (x0).
          ST1         {v3.16b},  [x7], x0
          ST1         {v2.16b}, [x17], x0
          ST1         {v1.16b}, [x16], x0
          ST1         {v0.16b},  [x6], x0

          SUB         x4, x4, x3              // a -= ks

          # nc loop
          # ST1 and SUB leave the flags untouched, so B.HI still tests the SUBS above
          # and loops while columns remain. Label 0 is defined earlier in the file.
          B.HI        0b

        # Function exit on the full-width store path.
        # Restore d8-d${STACK_REGISTERS} from stack, pop the ${SP_OFFSET}-byte frame and return.
        # The QD8_F16 and QD8_F32 variants additionally reload x19 from [sp, 40]
        # and d12 from [sp, 32] before the frame is released.
        $if DATATYPE in ["QD8_F16", "QD8_F32"]:
          LDR         x19, [sp, 40]
          LDR         d12, [sp, 32]
        LDP         d10, d11, [sp, 16]
        LDP         d8, d9, [sp], ${SP_OFFSET}      // post-index: sp += frame size
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it is safe to read 16 bytes.
        # Label 5 derives the remainder from kc (x2) into x0, label 6 is a second
        # entry that skips that step. NOTE(review): the branches to 5 and 6 are
        # earlier in the file - confirm x0 already holds the remainder when 6 is
        # entered directly.
        # Up to three groups of 4 bytes are processed. Each group consumes 64 bytes
        # of weights from w (x5) and one 4-byte lane of every A vector, then control
        # returns to label 4, which is also defined earlier in the file.
        .p2align    3
5:
        AND         x0, x2, 15              // kc remainder 4 to 12
6:
        # Load 16 bytes from each of the four A pointers x13, x14, x15 and x10.
        # These loads have no write-back, so the pointers are not advanced here.
        LDR         q0, [x13]
        LDP         q8,  q9,  [x5], 32
        LDR         q1, [x14]
        LDR         q2, [x15]
        LDR         q3, [x10]
        LDP         q10, q11, [x5], 32
        # Group 1: lane 0 (bytes 0-3) of v0-v3 against weights v8-v11.
        SDOT        v16.4s,  v8.16b, v0.4b[0]
        SDOT        v17.4s,  v8.16b, v1.4b[0]
        SDOT        v18.4s,  v8.16b, v2.4b[0]
        SDOT        v19.4s,  v8.16b, v3.4b[0]
        SDOT        v20.4s,  v9.16b, v0.4b[0]
        SDOT        v21.4s,  v9.16b, v1.4b[0]
        SDOT        v22.4s,  v9.16b, v2.4b[0]
        SDOT        v23.4s,  v9.16b, v3.4b[0]
        SDOT        v24.4s, v10.16b, v0.4b[0]
        SDOT        v25.4s, v10.16b, v1.4b[0]
        SDOT        v26.4s, v10.16b, v2.4b[0]
        SDOT        v27.4s, v10.16b, v3.4b[0]
        SDOT        v28.4s, v11.16b, v0.4b[0]
        SDOT        v29.4s, v11.16b, v1.4b[0]
        SDOT        v30.4s, v11.16b, v2.4b[0]
        SDOT        v31.4s, v11.16b, v3.4b[0]
        CMP         x0, 4
        B.LS        4b                      // remainder <= 4 bytes: done
        # Group 2: next 64 bytes of weights, lane 1 (bytes 4-7) of v0-v3.
        LDP         q8,  q9,  [x5], 32
        LDP         q10, q11,  [x5], 32
        SDOT        v16.4s,  v8.16b, v0.4b[1]
        SDOT        v17.4s,  v8.16b, v1.4b[1]
        SDOT        v18.4s,  v8.16b, v2.4b[1]
        SDOT        v19.4s,  v8.16b, v3.4b[1]
        SDOT        v20.4s,  v9.16b, v0.4b[1]
        SDOT        v21.4s,  v9.16b, v1.4b[1]
        SDOT        v22.4s,  v9.16b, v2.4b[1]
        SDOT        v23.4s,  v9.16b, v3.4b[1]
        SDOT        v24.4s, v10.16b, v0.4b[1]
        SDOT        v25.4s, v10.16b, v1.4b[1]
        SDOT        v26.4s, v10.16b, v2.4b[1]
        SDOT        v27.4s, v10.16b, v3.4b[1]
        SDOT        v28.4s, v11.16b, v0.4b[1]
        SDOT        v29.4s, v11.16b, v1.4b[1]
        SDOT        v30.4s, v11.16b, v2.4b[1]
        SDOT        v31.4s, v11.16b, v3.4b[1]
        CMP         x0, 8
        B.LS        4b                      // remainder <= 8 bytes: done
        # Group 3: next 64 bytes of weights, lane 2 (bytes 8-11) of v0-v3.
        LDP         q8,  q9,  [x5], 32
        LDP         q10, q11,  [x5], 32
        SDOT        v16.4s,  v8.16b, v0.4b[2]
        SDOT        v17.4s,  v8.16b, v1.4b[2]
        SDOT        v18.4s,  v8.16b, v2.4b[2]
        SDOT        v19.4s,  v8.16b, v3.4b[2]
        SDOT        v20.4s,  v9.16b, v0.4b[2]
        SDOT        v21.4s,  v9.16b, v1.4b[2]
        SDOT        v22.4s,  v9.16b, v2.4b[2]
        SDOT        v23.4s,  v9.16b, v3.4b[2]
        SDOT        v24.4s, v10.16b, v0.4b[2]
        SDOT        v25.4s, v10.16b, v1.4b[2]
        SDOT        v26.4s, v10.16b, v2.4b[2]
        SDOT        v27.4s, v10.16b, v3.4b[2]
        SDOT        v28.4s, v11.16b, v0.4b[2]
        SDOT        v29.4s, v11.16b, v1.4b[2]
        SDOT        v30.4s, v11.16b, v2.4b[2]
        SDOT        v31.4s, v11.16b, v3.4b[2]
        B           4b                      // all 12 remainder bytes consumed

        # Store odd width
        .p2align    3
$if DATATYPE == "QD8_F32":
  7:
          # Partial-width store for the QD8_F32 variant, 4 bytes per output element.
          # Bits 3, 2, 1 and 0 of x1 (nc) select stores of 8, 4, 2 and 1 columns.
          # After each store the not-yet-written columns are moved down into the
          # low registers or lanes and the four output pointers x7, x17, x16 and x6
          # are advanced, so the next step always stores from v16-v19.
          TBZ         x1, 3, 8f               // bit 3 clear: no group of 8 columns
          # Store 8 columns per row as register pairs (32 bytes per row).
          STP         q19, q23, [x7]
          STP         q18, q22, [x17]
          STP         q17, q21, [x16]
          STP         q16, q20, [x6]
          # Move the remaining 8 columns (v24-v31) down into v16-v23.
          MOV         v16.16b, v24.16b
          MOV         v17.16b, v25.16b
          MOV         v18.16b, v26.16b
          MOV         v19.16b, v27.16b
          MOV         v20.16b, v28.16b
          MOV         v21.16b, v29.16b
          MOV         v22.16b, v30.16b
          MOV         v23.16b, v31.16b
          ADD         x7, x7, #32
          ADD         x17, x17, #32
          ADD         x16, x16, #32
          ADD         x6, x6, #32
  8:
          TBZ         x1, 2, 9f               // bit 2 clear: no group of 4 columns
          # Store 4 columns per row (16 bytes), then move v20-v23 into v16-v19.
          STR         q19, [x7]
          STR         q18, [x17]
          STR         q17, [x16]
          STR         q16, [x6]
          MOV         v16.16b, v20.16b
          MOV         v17.16b, v21.16b
          MOV         v18.16b, v22.16b
          MOV         v19.16b, v23.16b
          ADD         x7, x7, #16
          ADD         x17, x17, #16
          ADD         x16, x16, #16
          ADD         x6, x6, #16
  9:
          TBZ         x1, 1, 10f              // bit 1 clear: no group of 2 columns
          # Store 2 columns per row (8 bytes), then bring the upper 64 bits down.
          ST1         {v19.2s}, [x7]
          ST1         {v18.2s}, [x17]
          ST1         {v17.2s}, [x16]
          ST1         {v16.2s}, [x6]
          DUP         d16, v16.d[1]
          DUP         d17, v17.d[1]
          DUP         d18, v18.d[1]
          DUP         d19, v19.d[1]
          ADD         x7, x7, #8
          ADD         x17, x17, #8
          ADD         x16, x16, #8
          ADD         x6, x6, #8
  10:
          TBZ         x1, 0, 11f              // bit 0 clear: no single column left
          # Store the last column of each row (4 bytes).
          STR         s19, [x7]
          STR         s18, [x17]
          STR         s17, [x16]
          STR         s16, [x6]
  11:
          # Restore x19 and d8-d12 from stack, release the 48-byte frame and return.
          LDR         x19, [sp, 40]
          LDR         d12, [sp, 32]
          LDP         d10, d11, [sp, 16]
          LDP         d8, d9, [sp], 48
          RET
$elif DATATYPE == "QD8_F16":
  7:
          # Partial-width store for the QD8_F16 variant, 2 bytes per output element.
          # Bits 3, 2, 1 and 0 of x1 (nc) select stores of 8, 4, 2 and 1 columns.
          # Every store post-increments its output pointer, and the not-yet-written
          # columns are then moved down so the next step stores from v16-v19 again.
          TBZ         x1, 3, 8f               // bit 3 clear: no group of 8 columns
          # Store 8 columns per row (16 bytes), then move v24-v27 into v16-v19.
          STR         q19, [x7], 16
          STR         q18, [x17], 16
          STR         q17, [x16], 16
          STR         q16, [x6], 16
          MOV         v16.16b, v24.16b
          MOV         v17.16b, v25.16b
          MOV         v18.16b, v26.16b
          MOV         v19.16b, v27.16b
  8:
          TBZ         x1, 2, 9f               // bit 2 clear: no group of 4 columns
          # Store 4 columns per row (8 bytes), then bring the upper 64 bits down.
          STR         d19, [x7], 8
          STR         d18, [x17], 8
          STR         d17, [x16], 8
          STR         d16, [x6], 8
          DUP         d16, v16.d[1]
          DUP         d17, v17.d[1]
          DUP         d18, v18.d[1]
          DUP         d19, v19.d[1]
  9:
          TBZ         x1, 1, 10f              // bit 1 clear: no group of 2 columns
          # Store 2 columns per row (4 bytes), then bring 32-bit lane 1 down.
          STR         s19, [x7], 4
          STR         s18, [x17], 4
          STR         s17, [x16], 4
          STR         s16, [x6], 4
          DUP         s16, v16.s[1]
          DUP         s17, v17.s[1]
          DUP         s18, v18.s[1]
          DUP         s19, v19.s[1]
  10:
          TBZ         x1, 0, 11f              // bit 0 clear: no single column left
          # Store the last column of each row (2 bytes).
          STR         h19, [x7]
          STR         h18, [x17]
          STR         h17, [x16]
          STR         h16, [x6]
  11:
          # Restore x19 and d8-d12 from stack, release the 48-byte frame and return.
          LDR         x19, [sp, 40]
          LDR         d12, [sp, 32]
          LDP         d10, d11, [sp, 16]
          LDP         d8, d9, [sp], 48
          RET
$else:
  7:
          # Partial-width store for the remaining data types, 1 byte per output
          # element, with the four rows held in v3, v2, v1 and v0.
          # Bits 3, 2, 1 and 0 of x1 (nc) select stores of 8, 4, 2 and 1 columns.
          # Every store post-increments its output pointer and the following DUP
          # moves the next unwritten lane down to lane 0 of the same register.
          TBZ         x1, 3, 8f               // bit 3 clear: no group of 8 columns
          # Store 8 columns per row (8 bytes), then bring the upper 64 bits down.
          STR         d3, [x7], 8
          STR         d2, [x17], 8
          DUP         d3, v3.d[1]
          DUP         d2, v2.d[1]
          STR         d1, [x16], 8
          STR         d0, [x6], 8
          DUP         d1, v1.d[1]
          DUP         d0, v0.d[1]
  8:
          TBZ         x1, 2, 9f               // bit 2 clear: no group of 4 columns
          # Store 4 columns per row (4 bytes), then bring 32-bit lane 1 down.
          STR         s3, [x7], 4
          STR         s2, [x17], 4
          DUP         s3, v3.s[1]
          DUP         s2, v2.s[1]
          STR         s1, [x16], 4
          STR         s0, [x6], 4
          DUP         s1, v1.s[1]
          DUP         s0, v0.s[1]
  9:
          TBZ         x1, 1, 10f              // bit 1 clear: no group of 2 columns
          # Store 2 columns per row (2 bytes), then bring 16-bit lane 1 down.
          STR         h3, [x7], 2
          STR         h2, [x17], 2
          DUP         h3, v3.h[1]
          DUP         h2, v2.h[1]
          STR         h1, [x16], 2
          STR         h0, [x6], 2
          DUP         h1, v1.h[1]
          DUP         h0, v0.h[1]
  10:
          TBZ         x1, 0, 11f              // bit 0 clear: no single column left
          # Store the last column of each row (1 byte).
          STR         b3, [x7]
          STR         b2, [x17]
          STR         b1, [x16]
          STR         b0, [x6]
  11:
          # Restore d8-d11 from stack, release the 32-byte frame and return.
          LDP         d10, d11, [sp, 16]
          LDP         d8,  d9, [sp], 32
          RET

END_FUNCTION xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_4x16c4__asm_aarch64_neondot${ISA}_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
