// Auto-generated file. Do not edit! // Template: src/x32-packw/scalar.c.in // Generator: tools/xngen // // Copyright 2023 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. #include #include #include #include "xnnpack/math.h" #include "xnnpack/packw.h" void xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint16_t* weights, const uint16_t* bias, const void* scale, uint16_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 64); assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); uint16_t* out = (uint16_t*) packed_weights; const uint16_t* b = (const uint16_t*) bias; do { // NC main loop multiple of 64 const uint16_t* w0 = (const uint16_t*) weights; size_t n = nc; for (;n >= 64; n -= 64) { if XNN_LIKELY(b != NULL) { out[0] = b[0]; out[1] = b[1]; out[2] = b[2]; out[3] = b[3]; out[4] = b[4]; out[5] = b[5]; out[6] = b[6]; out[7] = b[7]; out[8] = b[8]; out[9] = b[9]; out[10] = b[10]; out[11] = b[11]; out[12] = b[12]; out[13] = b[13]; out[14] = b[14]; out[15] = b[15]; out[16] = b[16]; out[17] = b[17]; out[18] = b[18]; out[19] = b[19]; out[20] = b[20]; out[21] = b[21]; out[22] = b[22]; out[23] = b[23]; out[24] = b[24]; out[25] = b[25]; out[26] = b[26]; out[27] = b[27]; out[28] = b[28]; out[29] = b[29]; out[30] = b[30]; out[31] = b[31]; out[32] = b[32]; out[33] = b[33]; out[34] = b[34]; out[35] = b[35]; out[36] = b[36]; out[37] = b[37]; out[38] = b[38]; out[39] = b[39]; out[40] = b[40]; out[41] = b[41]; out[42] = b[42]; out[43] = b[43]; out[44] = b[44]; out[45] = b[45]; out[46] = b[46]; out[47] = b[47]; out[48] = b[48]; out[49] = b[49]; out[50] = b[50]; out[51] = b[51]; out[52] = b[52]; out[53] = b[53]; out[54] = b[54]; out[55] = b[55]; out[56] = b[56]; out[57] = b[57]; out[58] = b[58]; out[59] = b[59]; out[60] = b[60]; out[61] = b[61]; out[62] = b[62]; out[63] = b[63]; b += 64; } else { out[0] = 0; out[1] = 0; out[2] = 0; out[3] = 0; out[4] = 0; out[5] = 0; out[6] = 0; out[7] = 0; out[8] = 0; out[9] = 0; out[10] = 0; out[11] = 0; out[12] = 0; out[13] = 0; out[14] = 0; out[15] = 0; out[16] = 0; out[17] = 0; out[18] = 0; out[19] = 0; out[20] = 0; out[21] = 0; out[22] = 0; out[23] = 0; out[24] = 0; out[25] = 0; out[26] = 0; out[27] = 0; out[28] = 0; out[29] = 0; out[30] = 0; out[31] = 0; out[32] = 0; out[33] = 0; out[34] = 0; out[35] = 0; out[36] = 0; out[37] = 0; out[38] = 0; out[39] = 0; out[40] = 0; out[41] = 0; out[42] = 0; out[43] = 0; out[44] = 0; out[45] = 0; out[46] = 0; out[47] = 0; out[48] = 0; out[49] = 0; out[50] = 0; out[51] = 0; out[52] = 0; out[53] = 0; out[54] = 0; out[55] = 0; out[56] = 0; out[57] = 0; out[58] = 0; out[59] = 0; out[60] = 0; out[61] = 0; out[62] = 0; out[63] = 0; } out += 64; const uint16_t* w1 = w0 + kc; const uint16_t* w2 = w1 + kc; const uint16_t* w3 = w2 + kc; const uint16_t* w4 = w3 + kc; const uint16_t* w5 = w4 + kc; const uint16_t* w6 = w5 + kc; const uint16_t* w7 = w6 + kc; const uint16_t* w8 = w7 + kc; const uint16_t* w9 = w8 + kc; const uint16_t* w10 = w9 + kc; const uint16_t* w11 = w10 + kc; const uint16_t* w12 = w11 + kc; const uint16_t* w13 = w12 + kc; const uint16_t* w14 = w13 + kc; const uint16_t* w15 = w14 + kc; const uint16_t* w16 = w15 + kc; const uint16_t* w17 = w16 + kc; const uint16_t* w18 = w17 + kc; const uint16_t* w19 = w18 + kc; const uint16_t* w20 = w19 + kc; const uint16_t* w21 = w20 + kc; const uint16_t* w22 = w21 + kc; const uint16_t* w23 = w22 + kc; const uint16_t* w24 = w23 + kc; const uint16_t* w25 = w24 + kc; const uint16_t* w26 = w25 + kc; const uint16_t* w27 = w26 + kc; const uint16_t* w28 = w27 + kc; const uint16_t* w29 = w28 + kc; const uint16_t* w30 = w29 + kc; const uint16_t* w31 = w30 + kc; const uint16_t* w32 = w31 + kc; const uint16_t* w33 = w32 + kc; const uint16_t* w34 = w33 + kc; const uint16_t* w35 = w34 + kc; const uint16_t* w36 = w35 + kc; const uint16_t* w37 = w36 + kc; const uint16_t* w38 = w37 + kc; const uint16_t* w39 = w38 + kc; const uint16_t* w40 = w39 + kc; const uint16_t* w41 = w40 + kc; const uint16_t* w42 = w41 + kc; const uint16_t* w43 = w42 + kc; const uint16_t* w44 = w43 + kc; const uint16_t* w45 = w44 + kc; const uint16_t* w46 = w45 + kc; const uint16_t* w47 = w46 + kc; const uint16_t* w48 = w47 + kc; const uint16_t* w49 = w48 + kc; const uint16_t* w50 = w49 + kc; const uint16_t* w51 = w50 + kc; const uint16_t* w52 = w51 + kc; const uint16_t* w53 = w52 + kc; const uint16_t* w54 = w53 + kc; const uint16_t* w55 = w54 + kc; const uint16_t* w56 = w55 + kc; const uint16_t* w57 = w56 + kc; const uint16_t* w58 = w57 + kc; const uint16_t* w59 = w58 + kc; const uint16_t* w60 = w59 + kc; const uint16_t* w61 = w60 + kc; const uint16_t* w62 = w61 + kc; const uint16_t* w63 = w62 + kc; // KC main loop multiple of 64x4 size_t k = kc; for (; k >= 4; k -= 4) { const uint16_t v00 = w0[0]; const uint16_t v01 = w0[1]; const uint16_t v02 = w0[2]; const uint16_t v03 = w0[3]; w0 += 4; const uint16_t v10 = w1[0]; const uint16_t v11 = w1[1]; const uint16_t v12 = w1[2]; const uint16_t v13 = w1[3]; w1 += 4; const uint16_t v20 = w2[0]; const uint16_t v21 = w2[1]; const uint16_t v22 = w2[2]; const uint16_t v23 = w2[3]; w2 += 4; const uint16_t v30 = w3[0]; const uint16_t v31 = w3[1]; const uint16_t v32 = w3[2]; const uint16_t v33 = w3[3]; w3 += 4; const uint16_t v40 = w4[0]; const uint16_t v41 = w4[1]; const uint16_t v42 = w4[2]; const uint16_t v43 = w4[3]; w4 += 4; const uint16_t v50 = w5[0]; const uint16_t v51 = w5[1]; const uint16_t v52 = w5[2]; const uint16_t v53 = w5[3]; w5 += 4; const uint16_t v60 = w6[0]; const uint16_t v61 = w6[1]; const uint16_t v62 = w6[2]; const uint16_t v63 = w6[3]; w6 += 4; const uint16_t v70 = w7[0]; const uint16_t v71 = w7[1]; const uint16_t v72 = w7[2]; const uint16_t v73 = w7[3]; w7 += 4; const uint16_t v80 = w8[0]; const uint16_t v81 = w8[1]; const uint16_t v82 = w8[2]; const uint16_t v83 = w8[3]; w8 += 4; const uint16_t v90 = w9[0]; const uint16_t v91 = w9[1]; const uint16_t v92 = w9[2]; const uint16_t v93 = w9[3]; w9 += 4; const uint16_t v100 = w10[0]; const uint16_t v101 = w10[1]; const uint16_t v102 = w10[2]; const uint16_t v103 = w10[3]; w10 += 4; const uint16_t v110 = w11[0]; const uint16_t v111 = w11[1]; const uint16_t v112 = w11[2]; const uint16_t v113 = w11[3]; w11 += 4; const uint16_t v120 = w12[0]; const uint16_t v121 = w12[1]; const uint16_t v122 = w12[2]; const uint16_t v123 = w12[3]; w12 += 4; const uint16_t v130 = w13[0]; const uint16_t v131 = w13[1]; const uint16_t v132 = w13[2]; const uint16_t v133 = w13[3]; w13 += 4; const uint16_t v140 = w14[0]; const uint16_t v141 = w14[1]; const uint16_t v142 = w14[2]; const uint16_t v143 = w14[3]; w14 += 4; const uint16_t v150 = w15[0]; const uint16_t v151 = w15[1]; const uint16_t v152 = w15[2]; const uint16_t v153 = w15[3]; w15 += 4; const uint16_t v160 = w16[0]; const uint16_t v161 = w16[1]; const uint16_t v162 = w16[2]; const uint16_t v163 = w16[3]; w16 += 4; const uint16_t v170 = w17[0]; const uint16_t v171 = w17[1]; const uint16_t v172 = w17[2]; const uint16_t v173 = w17[3]; w17 += 4; const uint16_t v180 = w18[0]; const uint16_t v181 = w18[1]; const uint16_t v182 = w18[2]; const uint16_t v183 = w18[3]; w18 += 4; const uint16_t v190 = w19[0]; const uint16_t v191 = w19[1]; const uint16_t v192 = w19[2]; const uint16_t v193 = w19[3]; w19 += 4; const uint16_t v200 = w20[0]; const uint16_t v201 = w20[1]; const uint16_t v202 = w20[2]; const uint16_t v203 = w20[3]; w20 += 4; const uint16_t v210 = w21[0]; const uint16_t v211 = w21[1]; const uint16_t v212 = w21[2]; const uint16_t v213 = w21[3]; w21 += 4; const uint16_t v220 = w22[0]; const uint16_t v221 = w22[1]; const uint16_t v222 = w22[2]; const uint16_t v223 = w22[3]; w22 += 4; const uint16_t v230 = w23[0]; const uint16_t v231 = w23[1]; const uint16_t v232 = w23[2]; const uint16_t v233 = w23[3]; w23 += 4; const uint16_t v240 = w24[0]; const uint16_t v241 = w24[1]; const uint16_t v242 = w24[2]; const uint16_t v243 = w24[3]; w24 += 4; const uint16_t v250 = w25[0]; const uint16_t v251 = w25[1]; const uint16_t v252 = w25[2]; const uint16_t v253 = w25[3]; w25 += 4; const uint16_t v260 = w26[0]; const uint16_t v261 = w26[1]; const uint16_t v262 = w26[2]; const uint16_t v263 = w26[3]; w26 += 4; const uint16_t v270 = w27[0]; const uint16_t v271 = w27[1]; const uint16_t v272 = w27[2]; const uint16_t v273 = w27[3]; w27 += 4; const uint16_t v280 = w28[0]; const uint16_t v281 = w28[1]; const uint16_t v282 = w28[2]; const uint16_t v283 = w28[3]; w28 += 4; const uint16_t v290 = w29[0]; const uint16_t v291 = w29[1]; const uint16_t v292 = w29[2]; const uint16_t v293 = w29[3]; w29 += 4; const uint16_t v300 = w30[0]; const uint16_t v301 = w30[1]; const uint16_t v302 = w30[2]; const uint16_t v303 = w30[3]; w30 += 4; const uint16_t v310 = w31[0]; const uint16_t v311 = w31[1]; const uint16_t v312 = w31[2]; const uint16_t v313 = w31[3]; w31 += 4; const uint16_t v320 = w32[0]; const uint16_t v321 = w32[1]; const uint16_t v322 = w32[2]; const uint16_t v323 = w32[3]; w32 += 4; const uint16_t v330 = w33[0]; const uint16_t v331 = w33[1]; const uint16_t v332 = w33[2]; const uint16_t v333 = w33[3]; w33 += 4; const uint16_t v340 = w34[0]; const uint16_t v341 = w34[1]; const uint16_t v342 = w34[2]; const uint16_t v343 = w34[3]; w34 += 4; const uint16_t v350 = w35[0]; const uint16_t v351 = w35[1]; const uint16_t v352 = w35[2]; const uint16_t v353 = w35[3]; w35 += 4; const uint16_t v360 = w36[0]; const uint16_t v361 = w36[1]; const uint16_t v362 = w36[2]; const uint16_t v363 = w36[3]; w36 += 4; const uint16_t v370 = w37[0]; const uint16_t v371 = w37[1]; const uint16_t v372 = w37[2]; const uint16_t v373 = w37[3]; w37 += 4; const uint16_t v380 = w38[0]; const uint16_t v381 = w38[1]; const uint16_t v382 = w38[2]; const uint16_t v383 = w38[3]; w38 += 4; const uint16_t v390 = w39[0]; const uint16_t v391 = w39[1]; const uint16_t v392 = w39[2]; const uint16_t v393 = w39[3]; w39 += 4; const uint16_t v400 = w40[0]; const uint16_t v401 = w40[1]; const uint16_t v402 = w40[2]; const uint16_t v403 = w40[3]; w40 += 4; const uint16_t v410 = w41[0]; const uint16_t v411 = w41[1]; const uint16_t v412 = w41[2]; const uint16_t v413 = w41[3]; w41 += 4; const uint16_t v420 = w42[0]; const uint16_t v421 = w42[1]; const uint16_t v422 = w42[2]; const uint16_t v423 = w42[3]; w42 += 4; const uint16_t v430 = w43[0]; const uint16_t v431 = w43[1]; const uint16_t v432 = w43[2]; const uint16_t v433 = w43[3]; w43 += 4; const uint16_t v440 = w44[0]; const uint16_t v441 = w44[1]; const uint16_t v442 = w44[2]; const uint16_t v443 = w44[3]; w44 += 4; const uint16_t v450 = w45[0]; const uint16_t v451 = w45[1]; const uint16_t v452 = w45[2]; const uint16_t v453 = w45[3]; w45 += 4; const uint16_t v460 = w46[0]; const uint16_t v461 = w46[1]; const uint16_t v462 = w46[2]; const uint16_t v463 = w46[3]; w46 += 4; const uint16_t v470 = w47[0]; const uint16_t v471 = w47[1]; const uint16_t v472 = w47[2]; const uint16_t v473 = w47[3]; w47 += 4; const uint16_t v480 = w48[0]; const uint16_t v481 = w48[1]; const uint16_t v482 = w48[2]; const uint16_t v483 = w48[3]; w48 += 4; const uint16_t v490 = w49[0]; const uint16_t v491 = w49[1]; const uint16_t v492 = w49[2]; const uint16_t v493 = w49[3]; w49 += 4; const uint16_t v500 = w50[0]; const uint16_t v501 = w50[1]; const uint16_t v502 = w50[2]; const uint16_t v503 = w50[3]; w50 += 4; const uint16_t v510 = w51[0]; const uint16_t v511 = w51[1]; const uint16_t v512 = w51[2]; const uint16_t v513 = w51[3]; w51 += 4; const uint16_t v520 = w52[0]; const uint16_t v521 = w52[1]; const uint16_t v522 = w52[2]; const uint16_t v523 = w52[3]; w52 += 4; const uint16_t v530 = w53[0]; const uint16_t v531 = w53[1]; const uint16_t v532 = w53[2]; const uint16_t v533 = w53[3]; w53 += 4; const uint16_t v540 = w54[0]; const uint16_t v541 = w54[1]; const uint16_t v542 = w54[2]; const uint16_t v543 = w54[3]; w54 += 4; const uint16_t v550 = w55[0]; const uint16_t v551 = w55[1]; const uint16_t v552 = w55[2]; const uint16_t v553 = w55[3]; w55 += 4; const uint16_t v560 = w56[0]; const uint16_t v561 = w56[1]; const uint16_t v562 = w56[2]; const uint16_t v563 = w56[3]; w56 += 4; const uint16_t v570 = w57[0]; const uint16_t v571 = w57[1]; const uint16_t v572 = w57[2]; const uint16_t v573 = w57[3]; w57 += 4; const uint16_t v580 = w58[0]; const uint16_t v581 = w58[1]; const uint16_t v582 = w58[2]; const uint16_t v583 = w58[3]; w58 += 4; const uint16_t v590 = w59[0]; const uint16_t v591 = w59[1]; const uint16_t v592 = w59[2]; const uint16_t v593 = w59[3]; w59 += 4; const uint16_t v600 = w60[0]; const uint16_t v601 = w60[1]; const uint16_t v602 = w60[2]; const uint16_t v603 = w60[3]; w60 += 4; const uint16_t v610 = w61[0]; const uint16_t v611 = w61[1]; const uint16_t v612 = w61[2]; const uint16_t v613 = w61[3]; w61 += 4; const uint16_t v620 = w62[0]; const uint16_t v621 = w62[1]; const uint16_t v622 = w62[2]; const uint16_t v623 = w62[3]; w62 += 4; const uint16_t v630 = w63[0]; const uint16_t v631 = w63[1]; const uint16_t v632 = w63[2]; const uint16_t v633 = w63[3]; w63 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[7] = v70; out[8] = v80; out[9] = v90; out[10] = v100; out[11] = v110; out[12] = v120; out[13] = v130; out[14] = v140; out[15] = v150; out[16] = v160; out[17] = v170; out[18] = v180; out[19] = v190; out[20] = v200; out[21] = v210; out[22] = v220; out[23] = v230; out[24] = v240; out[25] = v250; out[26] = v260; out[27] = v270; out[28] = v280; out[29] = v290; out[30] = v300; out[31] = v310; out[32] = v320; out[33] = v330; out[34] = v340; out[35] = v350; out[36] = v360; out[37] = v370; out[38] = v380; out[39] = v390; out[40] = v400; out[41] = v410; out[42] = v420; out[43] = v430; out[44] = v440; out[45] = v450; out[46] = v460; out[47] = v470; out[48] = v480; out[49] = v490; out[50] = v500; out[51] = v510; out[52] = v520; out[53] = v530; out[54] = v540; out[55] = v550; out[56] = v560; out[57] = v570; out[58] = v580; out[59] = v590; out[60] = v600; out[61] = v610; out[62] = v620; out[63] = v630; out[64] = v01; out[65] = v11; out[66] = v21; out[67] = v31; out[68] = v41; out[69] = v51; out[70] = v61; out[71] = v71; out[72] = v81; out[73] = v91; out[74] = v101; out[75] = v111; out[76] = v121; out[77] = v131; out[78] = v141; out[79] = v151; out[80] = v161; out[81] = v171; out[82] = v181; out[83] = v191; out[84] = v201; out[85] = v211; out[86] = v221; out[87] = v231; out[88] = v241; out[89] = v251; out[90] = v261; out[91] = v271; out[92] = v281; out[93] = v291; out[94] = v301; out[95] = v311; out[96] = v321; out[97] = v331; out[98] = v341; out[99] = v351; out[100] = v361; out[101] = v371; out[102] = v381; out[103] = v391; out[104] = v401; out[105] = v411; out[106] = v421; out[107] = v431; out[108] = v441; out[109] = v451; out[110] = v461; out[111] = v471; out[112] = v481; out[113] = v491; out[114] = v501; out[115] = v511; out[116] = v521; out[117] = v531; out[118] = v541; out[119] = v551; out[120] = v561; out[121] = v571; out[122] = v581; out[123] = v591; out[124] = v601; out[125] = v611; out[126] = v621; out[127] = v631; out[128] = v02; out[129] = v12; out[130] = v22; out[131] = v32; out[132] = v42; out[133] = v52; out[134] = v62; out[135] = v72; out[136] = v82; out[137] = v92; out[138] = v102; out[139] = v112; out[140] = v122; out[141] = v132; out[142] = v142; out[143] = v152; out[144] = v162; out[145] = v172; out[146] = v182; out[147] = v192; out[148] = v202; out[149] = v212; out[150] = v222; out[151] = v232; out[152] = v242; out[153] = v252; out[154] = v262; out[155] = v272; out[156] = v282; out[157] = v292; out[158] = v302; out[159] = v312; out[160] = v322; out[161] = v332; out[162] = v342; out[163] = v352; out[164] = v362; out[165] = v372; out[166] = v382; out[167] = v392; out[168] = v402; out[169] = v412; out[170] = v422; out[171] = v432; out[172] = v442; out[173] = v452; out[174] = v462; out[175] = v472; out[176] = v482; out[177] = v492; out[178] = v502; out[179] = v512; out[180] = v522; out[181] = v532; out[182] = v542; out[183] = v552; out[184] = v562; out[185] = v572; out[186] = v582; out[187] = v592; out[188] = v602; out[189] = v612; out[190] = v622; out[191] = v632; out[192] = v03; out[193] = v13; out[194] = v23; out[195] = v33; out[196] = v43; out[197] = v53; out[198] = v63; out[199] = v73; out[200] = v83; out[201] = v93; out[202] = v103; out[203] = v113; out[204] = v123; out[205] = v133; out[206] = v143; out[207] = v153; out[208] = v163; out[209] = v173; out[210] = v183; out[211] = v193; out[212] = v203; out[213] = v213; out[214] = v223; out[215] = v233; out[216] = v243; out[217] = v253; out[218] = v263; out[219] = v273; out[220] = v283; out[221] = v293; out[222] = v303; out[223] = v313; out[224] = v323; out[225] = v333; out[226] = v343; out[227] = v353; out[228] = v363; out[229] = v373; out[230] = v383; out[231] = v393; out[232] = v403; out[233] = v413; out[234] = v423; out[235] = v433; out[236] = v443; out[237] = v453; out[238] = v463; out[239] = v473; out[240] = v483; out[241] = v493; out[242] = v503; out[243] = v513; out[244] = v523; out[245] = v533; out[246] = v543; out[247] = v553; out[248] = v563; out[249] = v573; out[250] = v583; out[251] = v593; out[252] = v603; out[253] = v613; out[254] = v623; out[255] = v633; out += 256; } // KC remainder for (; k != 0; --k) { const uint16_t v0 = *w0++; out[0] = v0; const uint16_t v1 = *w1++; out[1] = v1; const uint16_t v2 = *w2++; out[2] = v2; const uint16_t v3 = *w3++; out[3] = v3; const uint16_t v4 = *w4++; out[4] = v4; const uint16_t v5 = *w5++; out[5] = v5; const uint16_t v6 = *w6++; out[6] = v6; const uint16_t v7 = *w7++; out[7] = v7; const uint16_t v8 = *w8++; out[8] = v8; const uint16_t v9 = *w9++; out[9] = v9; const uint16_t v10 = *w10++; out[10] = v10; const uint16_t v11 = *w11++; out[11] = v11; const uint16_t v12 = *w12++; out[12] = v12; const uint16_t v13 = *w13++; out[13] = v13; const uint16_t v14 = *w14++; out[14] = v14; const uint16_t v15 = *w15++; out[15] = v15; const uint16_t v16 = *w16++; out[16] = v16; const uint16_t v17 = *w17++; out[17] = v17; const uint16_t v18 = *w18++; out[18] = v18; const uint16_t v19 = *w19++; out[19] = v19; const uint16_t v20 = *w20++; out[20] = v20; const uint16_t v21 = *w21++; out[21] = v21; const uint16_t v22 = *w22++; out[22] = v22; const uint16_t v23 = *w23++; out[23] = v23; const uint16_t v24 = *w24++; out[24] = v24; const uint16_t v25 = *w25++; out[25] = v25; const uint16_t v26 = *w26++; out[26] = v26; const uint16_t v27 = *w27++; out[27] = v27; const uint16_t v28 = *w28++; out[28] = v28; const uint16_t v29 = *w29++; out[29] = v29; const uint16_t v30 = *w30++; out[30] = v30; const uint16_t v31 = *w31++; out[31] = v31; const uint16_t v32 = *w32++; out[32] = v32; const uint16_t v33 = *w33++; out[33] = v33; const uint16_t v34 = *w34++; out[34] = v34; const uint16_t v35 = *w35++; out[35] = v35; const uint16_t v36 = *w36++; out[36] = v36; const uint16_t v37 = *w37++; out[37] = v37; const uint16_t v38 = *w38++; out[38] = v38; const uint16_t v39 = *w39++; out[39] = v39; const uint16_t v40 = *w40++; out[40] = v40; const uint16_t v41 = *w41++; out[41] = v41; const uint16_t v42 = *w42++; out[42] = v42; const uint16_t v43 = *w43++; out[43] = v43; const uint16_t v44 = *w44++; out[44] = v44; const uint16_t v45 = *w45++; out[45] = v45; const uint16_t v46 = *w46++; out[46] = v46; const uint16_t v47 = *w47++; out[47] = v47; const uint16_t v48 = *w48++; out[48] = v48; const uint16_t v49 = *w49++; out[49] = v49; const uint16_t v50 = *w50++; out[50] = v50; const uint16_t v51 = *w51++; out[51] = v51; const uint16_t v52 = *w52++; out[52] = v52; const uint16_t v53 = *w53++; out[53] = v53; const uint16_t v54 = *w54++; out[54] = v54; const uint16_t v55 = *w55++; out[55] = v55; const uint16_t v56 = *w56++; out[56] = v56; const uint16_t v57 = *w57++; out[57] = v57; const uint16_t v58 = *w58++; out[58] = v58; const uint16_t v59 = *w59++; out[59] = v59; const uint16_t v60 = *w60++; out[60] = v60; const uint16_t v61 = *w61++; out[61] = v61; const uint16_t v62 = *w62++; out[62] = v62; const uint16_t v63 = *w63++; out[63] = v63; out += 64; } out = (uint16_t*) ((uintptr_t) out + extra_bytes); w0 = w63; } // NC remainder (1..63) if XNN_UNLIKELY(n != 0) { if XNN_LIKELY(b != NULL) { size_t nb = n; do { *out++ = *b++; } while (--nb != 0); } else { size_t nb = n; do { *out++ = 0; } while (--nb != 0); } out += (64 - n); // NR remainder has less than 64 rows so last row is not loaded const uint16_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint16_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint16_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint16_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint16_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint16_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } const uint16_t* w7 = w6 + kc; if XNN_UNPREDICTABLE(n < 8) { w7 = w6; } const uint16_t* w8 = w7 + kc; if XNN_UNPREDICTABLE(n <= 8) { w8 = w7; } const uint16_t* w9 = w8 + kc; if XNN_UNPREDICTABLE(n < 10) { w9 = w8; } const uint16_t* w10 = w9 + kc; if XNN_UNPREDICTABLE(n <= 10) { w10 = w9; } const uint16_t* w11 = w10 + kc; if XNN_UNPREDICTABLE(n < 12) { w11 = w10; } const uint16_t* w12 = w11 + kc; if XNN_UNPREDICTABLE(n <= 12) { w12 = w11; } const uint16_t* w13 = w12 + kc; if XNN_UNPREDICTABLE(n < 14) { w13 = w12; } const uint16_t* w14 = w13 + kc; if XNN_UNPREDICTABLE(n <= 14) { w14 = w13; } const uint16_t* w15 = w14 + kc; if XNN_UNPREDICTABLE(n < 16) { w15 = w14; } const uint16_t* w16 = w15 + kc; if XNN_UNPREDICTABLE(n <= 16) { w16 = w15; } const uint16_t* w17 = w16 + kc; if XNN_UNPREDICTABLE(n < 18) { w17 = w16; } const uint16_t* w18 = w17 + kc; if XNN_UNPREDICTABLE(n <= 18) { w18 = w17; } const uint16_t* w19 = w18 + kc; if XNN_UNPREDICTABLE(n < 20) { w19 = w18; } const uint16_t* w20 = w19 + kc; if XNN_UNPREDICTABLE(n <= 20) { w20 = w19; } const uint16_t* w21 = w20 + kc; if XNN_UNPREDICTABLE(n < 22) { w21 = w20; } const uint16_t* w22 = w21 + kc; if XNN_UNPREDICTABLE(n <= 22) { w22 = w21; } const uint16_t* w23 = w22 + kc; if XNN_UNPREDICTABLE(n < 24) { w23 = w22; } const uint16_t* w24 = w23 + kc; if XNN_UNPREDICTABLE(n <= 24) { w24 = w23; } const uint16_t* w25 = w24 + kc; if XNN_UNPREDICTABLE(n < 26) { w25 = w24; } const uint16_t* w26 = w25 + kc; if XNN_UNPREDICTABLE(n <= 26) { w26 = w25; } const uint16_t* w27 = w26 + kc; if XNN_UNPREDICTABLE(n < 28) { w27 = w26; } const uint16_t* w28 = w27 + kc; if XNN_UNPREDICTABLE(n <= 28) { w28 = w27; } const uint16_t* w29 = w28 + kc; if XNN_UNPREDICTABLE(n < 30) { w29 = w28; } const uint16_t* w30 = w29 + kc; if XNN_UNPREDICTABLE(n <= 30) { w30 = w29; } const uint16_t* w31 = w30 + kc; if XNN_UNPREDICTABLE(n < 32) { w31 = w30; } const uint16_t* w32 = w31 + kc; if XNN_UNPREDICTABLE(n <= 32) { w32 = w31; } const uint16_t* w33 = w32 + kc; if XNN_UNPREDICTABLE(n < 34) { w33 = w32; } const uint16_t* w34 = w33 + kc; if XNN_UNPREDICTABLE(n <= 34) { w34 = w33; } const uint16_t* w35 = w34 + kc; if XNN_UNPREDICTABLE(n < 36) { w35 = w34; } const uint16_t* w36 = w35 + kc; if XNN_UNPREDICTABLE(n <= 36) { w36 = w35; } const uint16_t* w37 = w36 + kc; if XNN_UNPREDICTABLE(n < 38) { w37 = w36; } const uint16_t* w38 = w37 + kc; if XNN_UNPREDICTABLE(n <= 38) { w38 = w37; } const uint16_t* w39 = w38 + kc; if XNN_UNPREDICTABLE(n < 40) { w39 = w38; } const uint16_t* w40 = w39 + kc; if XNN_UNPREDICTABLE(n <= 40) { w40 = w39; } const uint16_t* w41 = w40 + kc; if XNN_UNPREDICTABLE(n < 42) { w41 = w40; } const uint16_t* w42 = w41 + kc; if XNN_UNPREDICTABLE(n <= 42) { w42 = w41; } const uint16_t* w43 = w42 + kc; if XNN_UNPREDICTABLE(n < 44) { w43 = w42; } const uint16_t* w44 = w43 + kc; if XNN_UNPREDICTABLE(n <= 44) { w44 = w43; } const uint16_t* w45 = w44 + kc; if XNN_UNPREDICTABLE(n < 46) { w45 = w44; } const uint16_t* w46 = w45 + kc; if XNN_UNPREDICTABLE(n <= 46) { w46 = w45; } const uint16_t* w47 = w46 + kc; if XNN_UNPREDICTABLE(n < 48) { w47 = w46; } const uint16_t* w48 = w47 + kc; if XNN_UNPREDICTABLE(n <= 48) { w48 = w47; } const uint16_t* w49 = w48 + kc; if XNN_UNPREDICTABLE(n < 50) { w49 = w48; } const uint16_t* w50 = w49 + kc; if XNN_UNPREDICTABLE(n <= 50) { w50 = w49; } const uint16_t* w51 = w50 + kc; if XNN_UNPREDICTABLE(n < 52) { w51 = w50; } const uint16_t* w52 = w51 + kc; if XNN_UNPREDICTABLE(n <= 52) { w52 = w51; } const uint16_t* w53 = w52 + kc; if XNN_UNPREDICTABLE(n < 54) { w53 = w52; } const uint16_t* w54 = w53 + kc; if XNN_UNPREDICTABLE(n <= 54) { w54 = w53; } const uint16_t* w55 = w54 + kc; if XNN_UNPREDICTABLE(n < 56) { w55 = w54; } const uint16_t* w56 = w55 + kc; if XNN_UNPREDICTABLE(n <= 56) { w56 = w55; } const uint16_t* w57 = w56 + kc; if XNN_UNPREDICTABLE(n < 58) { w57 = w56; } const uint16_t* w58 = w57 + kc; if XNN_UNPREDICTABLE(n <= 58) { w58 = w57; } const uint16_t* w59 = w58 + kc; if XNN_UNPREDICTABLE(n < 60) { w59 = w58; } const uint16_t* w60 = w59 + kc; if XNN_UNPREDICTABLE(n <= 60) { w60 = w59; } const uint16_t* w61 = w60 + kc; if XNN_UNPREDICTABLE(n < 62) { w61 = w60; } const uint16_t* w62 = w61 + kc; if XNN_UNPREDICTABLE(n <= 62) { w62 = w61; } // KC main loop multiple of 64x4 size_t k = kc; for (; k >= 4; k -= 4) { const uint16_t v00 = w0[0]; const uint16_t v01 = w0[1]; const uint16_t v02 = w0[2]; const uint16_t v03 = w0[3]; w0 += 4; const uint16_t v10 = w1[0]; const uint16_t v11 = w1[1]; const uint16_t v12 = w1[2]; const uint16_t v13 = w1[3]; w1 += 4; const uint16_t v20 = w2[0]; const uint16_t v21 = w2[1]; const uint16_t v22 = w2[2]; const uint16_t v23 = w2[3]; w2 += 4; const uint16_t v30 = w3[0]; const uint16_t v31 = w3[1]; const uint16_t v32 = w3[2]; const uint16_t v33 = w3[3]; w3 += 4; const uint16_t v40 = w4[0]; const uint16_t v41 = w4[1]; const uint16_t v42 = w4[2]; const uint16_t v43 = w4[3]; w4 += 4; const uint16_t v50 = w5[0]; const uint16_t v51 = w5[1]; const uint16_t v52 = w5[2]; const uint16_t v53 = w5[3]; w5 += 4; const uint16_t v60 = w6[0]; const uint16_t v61 = w6[1]; const uint16_t v62 = w6[2]; const uint16_t v63 = w6[3]; w6 += 4; const uint16_t v70 = w7[0]; const uint16_t v71 = w7[1]; const uint16_t v72 = w7[2]; const uint16_t v73 = w7[3]; w7 += 4; const uint16_t v80 = w8[0]; const uint16_t v81 = w8[1]; const uint16_t v82 = w8[2]; const uint16_t v83 = w8[3]; w8 += 4; const uint16_t v90 = w9[0]; const uint16_t v91 = w9[1]; const uint16_t v92 = w9[2]; const uint16_t v93 = w9[3]; w9 += 4; const uint16_t v100 = w10[0]; const uint16_t v101 = w10[1]; const uint16_t v102 = w10[2]; const uint16_t v103 = w10[3]; w10 += 4; const uint16_t v110 = w11[0]; const uint16_t v111 = w11[1]; const uint16_t v112 = w11[2]; const uint16_t v113 = w11[3]; w11 += 4; const uint16_t v120 = w12[0]; const uint16_t v121 = w12[1]; const uint16_t v122 = w12[2]; const uint16_t v123 = w12[3]; w12 += 4; const uint16_t v130 = w13[0]; const uint16_t v131 = w13[1]; const uint16_t v132 = w13[2]; const uint16_t v133 = w13[3]; w13 += 4; const uint16_t v140 = w14[0]; const uint16_t v141 = w14[1]; const uint16_t v142 = w14[2]; const uint16_t v143 = w14[3]; w14 += 4; const uint16_t v150 = w15[0]; const uint16_t v151 = w15[1]; const uint16_t v152 = w15[2]; const uint16_t v153 = w15[3]; w15 += 4; const uint16_t v160 = w16[0]; const uint16_t v161 = w16[1]; const uint16_t v162 = w16[2]; const uint16_t v163 = w16[3]; w16 += 4; const uint16_t v170 = w17[0]; const uint16_t v171 = w17[1]; const uint16_t v172 = w17[2]; const uint16_t v173 = w17[3]; w17 += 4; const uint16_t v180 = w18[0]; const uint16_t v181 = w18[1]; const uint16_t v182 = w18[2]; const uint16_t v183 = w18[3]; w18 += 4; const uint16_t v190 = w19[0]; const uint16_t v191 = w19[1]; const uint16_t v192 = w19[2]; const uint16_t v193 = w19[3]; w19 += 4; const uint16_t v200 = w20[0]; const uint16_t v201 = w20[1]; const uint16_t v202 = w20[2]; const uint16_t v203 = w20[3]; w20 += 4; const uint16_t v210 = w21[0]; const uint16_t v211 = w21[1]; const uint16_t v212 = w21[2]; const uint16_t v213 = w21[3]; w21 += 4; const uint16_t v220 = w22[0]; const uint16_t v221 = w22[1]; const uint16_t v222 = w22[2]; const uint16_t v223 = w22[3]; w22 += 4; const uint16_t v230 = w23[0]; const uint16_t v231 = w23[1]; const uint16_t v232 = w23[2]; const uint16_t v233 = w23[3]; w23 += 4; const uint16_t v240 = w24[0]; const uint16_t v241 = w24[1]; const uint16_t v242 = w24[2]; const uint16_t v243 = w24[3]; w24 += 4; const uint16_t v250 = w25[0]; const uint16_t v251 = w25[1]; const uint16_t v252 = w25[2]; const uint16_t v253 = w25[3]; w25 += 4; const uint16_t v260 = w26[0]; const uint16_t v261 = w26[1]; const uint16_t v262 = w26[2]; const uint16_t v263 = w26[3]; w26 += 4; const uint16_t v270 = w27[0]; const uint16_t v271 = w27[1]; const uint16_t v272 = w27[2]; const uint16_t v273 = w27[3]; w27 += 4; const uint16_t v280 = w28[0]; const uint16_t v281 = w28[1]; const uint16_t v282 = w28[2]; const uint16_t v283 = w28[3]; w28 += 4; const uint16_t v290 = w29[0]; const uint16_t v291 = w29[1]; const uint16_t v292 = w29[2]; const uint16_t v293 = w29[3]; w29 += 4; const uint16_t v300 = w30[0]; const uint16_t v301 = w30[1]; const uint16_t v302 = w30[2]; const uint16_t v303 = w30[3]; w30 += 4; const uint16_t v310 = w31[0]; const uint16_t v311 = w31[1]; const uint16_t v312 = w31[2]; const uint16_t v313 = w31[3]; w31 += 4; const uint16_t v320 = w32[0]; const uint16_t v321 = w32[1]; const uint16_t v322 = w32[2]; const uint16_t v323 = w32[3]; w32 += 4; const uint16_t v330 = w33[0]; const uint16_t v331 = w33[1]; const uint16_t v332 = w33[2]; const uint16_t v333 = w33[3]; w33 += 4; const uint16_t v340 = w34[0]; const uint16_t v341 = w34[1]; const uint16_t v342 = w34[2]; const uint16_t v343 = w34[3]; w34 += 4; const uint16_t v350 = w35[0]; const uint16_t v351 = w35[1]; const uint16_t v352 = w35[2]; const uint16_t v353 = w35[3]; w35 += 4; const uint16_t v360 = w36[0]; const uint16_t v361 = w36[1]; const uint16_t v362 = w36[2]; const uint16_t v363 = w36[3]; w36 += 4; const uint16_t v370 = w37[0]; const uint16_t v371 = w37[1]; const uint16_t v372 = w37[2]; const uint16_t v373 = w37[3]; w37 += 4; const uint16_t v380 = w38[0]; const uint16_t v381 = w38[1]; const uint16_t v382 = w38[2]; const uint16_t v383 = w38[3]; w38 += 4; const uint16_t v390 = w39[0]; const uint16_t v391 = w39[1]; const uint16_t v392 = w39[2]; const uint16_t v393 = w39[3]; w39 += 4; const uint16_t v400 = w40[0]; const uint16_t v401 = w40[1]; const uint16_t v402 = w40[2]; const uint16_t v403 = w40[3]; w40 += 4; const uint16_t v410 = w41[0]; const uint16_t v411 = w41[1]; const uint16_t v412 = w41[2]; const uint16_t v413 = w41[3]; w41 += 4; const uint16_t v420 = w42[0]; const uint16_t v421 = w42[1]; const uint16_t v422 = w42[2]; const uint16_t v423 = w42[3]; w42 += 4; const uint16_t v430 = w43[0]; const uint16_t v431 = w43[1]; const uint16_t v432 = w43[2]; const uint16_t v433 = w43[3]; w43 += 4; const uint16_t v440 = w44[0]; const uint16_t v441 = w44[1]; const uint16_t v442 = w44[2]; const uint16_t v443 = w44[3]; w44 += 4; const uint16_t v450 = w45[0]; const uint16_t v451 = w45[1]; const uint16_t v452 = w45[2]; const uint16_t v453 = w45[3]; w45 += 4; const uint16_t v460 = w46[0]; const uint16_t v461 = w46[1]; const uint16_t v462 = w46[2]; const uint16_t v463 = w46[3]; w46 += 4; const uint16_t v470 = w47[0]; const uint16_t v471 = w47[1]; const uint16_t v472 = w47[2]; const uint16_t v473 = w47[3]; w47 += 4; const uint16_t v480 = w48[0]; const uint16_t v481 = w48[1]; const uint16_t v482 = w48[2]; const uint16_t v483 = w48[3]; w48 += 4; const uint16_t v490 = w49[0]; const uint16_t v491 = w49[1]; const uint16_t v492 = w49[2]; const uint16_t v493 = w49[3]; w49 += 4; const uint16_t v500 = w50[0]; const uint16_t v501 = w50[1]; const uint16_t v502 = w50[2]; const uint16_t v503 = w50[3]; w50 += 4; const uint16_t v510 = w51[0]; const uint16_t v511 = w51[1]; const uint16_t v512 = w51[2]; const uint16_t v513 = w51[3]; w51 += 4; const uint16_t v520 = w52[0]; const uint16_t v521 = w52[1]; const uint16_t v522 = w52[2]; const uint16_t v523 = w52[3]; w52 += 4; const uint16_t v530 = w53[0]; const uint16_t v531 = w53[1]; const uint16_t v532 = w53[2]; const uint16_t v533 = w53[3]; w53 += 4; const uint16_t v540 = w54[0]; const uint16_t v541 = w54[1]; const uint16_t v542 = w54[2]; const uint16_t v543 = w54[3]; w54 += 4; const uint16_t v550 = w55[0]; const uint16_t v551 = w55[1]; const uint16_t v552 = w55[2]; const uint16_t v553 = w55[3]; w55 += 4; const uint16_t v560 = w56[0]; const uint16_t v561 = w56[1]; const uint16_t v562 = w56[2]; const uint16_t v563 = w56[3]; w56 += 4; const uint16_t v570 = w57[0]; const uint16_t v571 = w57[1]; const uint16_t v572 = w57[2]; const uint16_t v573 = w57[3]; w57 += 4; const uint16_t v580 = w58[0]; const uint16_t v581 = w58[1]; const uint16_t v582 = w58[2]; const uint16_t v583 = w58[3]; w58 += 4; const uint16_t v590 = w59[0]; const uint16_t v591 = w59[1]; const uint16_t v592 = w59[2]; const uint16_t v593 = w59[3]; w59 += 4; const uint16_t v600 = w60[0]; const uint16_t v601 = w60[1]; const uint16_t v602 = w60[2]; const uint16_t v603 = w60[3]; w60 += 4; const uint16_t v610 = w61[0]; const uint16_t v611 = w61[1]; const uint16_t v612 = w61[2]; const uint16_t v613 = w61[3]; w61 += 4; const uint16_t v620 = w62[0]; const uint16_t v621 = w62[1]; const uint16_t v622 = w62[2]; const uint16_t v623 = w62[3]; w62 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[7] = v70; out[8] = v80; out[9] = v90; out[10] = v100; out[11] = v110; out[12] = v120; out[13] = v130; out[14] = v140; out[15] = v150; out[16] = v160; out[17] = v170; out[18] = v180; out[19] = v190; out[20] = v200; out[21] = v210; out[22] = v220; out[23] = v230; out[24] = v240; out[25] = v250; out[26] = v260; out[27] = v270; out[28] = v280; out[29] = v290; out[30] = v300; out[31] = v310; out[32] = v320; out[33] = v330; out[34] = v340; out[35] = v350; out[36] = v360; out[37] = v370; out[38] = v380; out[39] = v390; out[40] = v400; out[41] = v410; out[42] = v420; out[43] = v430; out[44] = v440; out[45] = v450; out[46] = v460; out[47] = v470; out[48] = v480; out[49] = v490; out[50] = v500; out[51] = v510; out[52] = v520; out[53] = v530; out[54] = v540; out[55] = v550; out[56] = v560; out[57] = v570; out[58] = v580; out[59] = v590; out[60] = v600; out[61] = v610; out[62] = v620; out[64] = v01; out[65] = v11; out[66] = v21; out[67] = v31; out[68] = v41; out[69] = v51; out[70] = v61; out[71] = v71; out[72] = v81; out[73] = v91; out[74] = v101; out[75] = v111; out[76] = v121; out[77] = v131; out[78] = v141; out[79] = v151; out[80] = v161; out[81] = v171; out[82] = v181; out[83] = v191; out[84] = v201; out[85] = v211; out[86] = v221; out[87] = v231; out[88] = v241; out[89] = v251; out[90] = v261; out[91] = v271; out[92] = v281; out[93] = v291; out[94] = v301; out[95] = v311; out[96] = v321; out[97] = v331; out[98] = v341; out[99] = v351; out[100] = v361; out[101] = v371; out[102] = v381; out[103] = v391; out[104] = v401; out[105] = v411; out[106] = v421; out[107] = v431; out[108] = v441; out[109] = v451; out[110] = v461; out[111] = v471; out[112] = v481; out[113] = v491; out[114] = v501; out[115] = v511; out[116] = v521; out[117] = v531; out[118] = v541; out[119] = v551; out[120] = v561; out[121] = v571; out[122] = v581; out[123] = v591; out[124] = v601; out[125] = v611; out[126] = v621; out[128] = v02; out[129] = v12; out[130] = v22; out[131] = v32; out[132] = v42; out[133] = v52; out[134] = v62; out[135] = v72; out[136] = v82; out[137] = v92; out[138] = v102; out[139] = v112; out[140] = v122; out[141] = v132; out[142] = v142; out[143] = v152; out[144] = v162; out[145] = v172; out[146] = v182; out[147] = v192; out[148] = v202; out[149] = v212; out[150] = v222; out[151] = v232; out[152] = v242; out[153] = v252; out[154] = v262; out[155] = v272; out[156] = v282; out[157] = v292; out[158] = v302; out[159] = v312; out[160] = v322; out[161] = v332; out[162] = v342; out[163] = v352; out[164] = v362; out[165] = v372; out[166] = v382; out[167] = v392; out[168] = v402; out[169] = v412; out[170] = v422; out[171] = v432; out[172] = v442; out[173] = v452; out[174] = v462; out[175] = v472; out[176] = v482; out[177] = v492; out[178] = v502; out[179] = v512; out[180] = v522; out[181] = v532; out[182] = v542; out[183] = v552; out[184] = v562; out[185] = v572; out[186] = v582; out[187] = v592; out[188] = v602; out[189] = v612; out[190] = v622; out[192] = v03; out[193] = v13; out[194] = v23; out[195] = v33; out[196] = v43; out[197] = v53; out[198] = v63; out[199] = v73; out[200] = v83; out[201] = v93; out[202] = v103; out[203] = v113; out[204] = v123; out[205] = v133; out[206] = v143; out[207] = v153; out[208] = v163; out[209] = v173; out[210] = v183; out[211] = v193; out[212] = v203; out[213] = v213; out[214] = v223; out[215] = v233; out[216] = v243; out[217] = v253; out[218] = v263; out[219] = v273; out[220] = v283; out[221] = v293; out[222] = v303; out[223] = v313; out[224] = v323; out[225] = v333; out[226] = v343; out[227] = v353; out[228] = v363; out[229] = v373; out[230] = v383; out[231] = v393; out[232] = v403; out[233] = v413; out[234] = v423; out[235] = v433; out[236] = v443; out[237] = v453; out[238] = v463; out[239] = v473; out[240] = v483; out[241] = v493; out[242] = v503; out[243] = v513; out[244] = v523; out[245] = v533; out[246] = v543; out[247] = v553; out[248] = v563; out[249] = v573; out[250] = v583; out[251] = v593; out[252] = v603; out[253] = v613; out[254] = v623; out += 256; } // KC remainder of 1..3 for (; k != 0; --k) { const uint16_t v0 = *w0++; out[0] = v0; const uint16_t v1 = *w1++; out[1] = v1; const uint16_t v2 = *w2++; out[2] = v2; const uint16_t v3 = *w3++; out[3] = v3; const uint16_t v4 = *w4++; out[4] = v4; const uint16_t v5 = *w5++; out[5] = v5; const uint16_t v6 = *w6++; out[6] = v6; const uint16_t v7 = *w7++; out[7] = v7; const uint16_t v8 = *w8++; out[8] = v8; const uint16_t v9 = *w9++; out[9] = v9; const uint16_t v10 = *w10++; out[10] = v10; const uint16_t v11 = *w11++; out[11] = v11; const uint16_t v12 = *w12++; out[12] = v12; const uint16_t v13 = *w13++; out[13] = v13; const uint16_t v14 = *w14++; out[14] = v14; const uint16_t v15 = *w15++; out[15] = v15; const uint16_t v16 = *w16++; out[16] = v16; const uint16_t v17 = *w17++; out[17] = v17; const uint16_t v18 = *w18++; out[18] = v18; const uint16_t v19 = *w19++; out[19] = v19; const uint16_t v20 = *w20++; out[20] = v20; const uint16_t v21 = *w21++; out[21] = v21; const uint16_t v22 = *w22++; out[22] = v22; const uint16_t v23 = *w23++; out[23] = v23; const uint16_t v24 = *w24++; out[24] = v24; const uint16_t v25 = *w25++; out[25] = v25; const uint16_t v26 = *w26++; out[26] = v26; const uint16_t v27 = *w27++; out[27] = v27; const uint16_t v28 = *w28++; out[28] = v28; const uint16_t v29 = *w29++; out[29] = v29; const uint16_t v30 = *w30++; out[30] = v30; const uint16_t v31 = *w31++; out[31] = v31; const uint16_t v32 = *w32++; out[32] = v32; const uint16_t v33 = *w33++; out[33] = v33; const uint16_t v34 = *w34++; out[34] = v34; const uint16_t v35 = *w35++; out[35] = v35; const uint16_t v36 = *w36++; out[36] = v36; const uint16_t v37 = *w37++; out[37] = v37; const uint16_t v38 = *w38++; out[38] = v38; const uint16_t v39 = *w39++; out[39] = v39; const uint16_t v40 = *w40++; out[40] = v40; const uint16_t v41 = *w41++; out[41] = v41; const uint16_t v42 = *w42++; out[42] = v42; const uint16_t v43 = *w43++; out[43] = v43; const uint16_t v44 = *w44++; out[44] = v44; const uint16_t v45 = *w45++; out[45] = v45; const uint16_t v46 = *w46++; out[46] = v46; const uint16_t v47 = *w47++; out[47] = v47; const uint16_t v48 = *w48++; out[48] = v48; const uint16_t v49 = *w49++; out[49] = v49; const uint16_t v50 = *w50++; out[50] = v50; const uint16_t v51 = *w51++; out[51] = v51; const uint16_t v52 = *w52++; out[52] = v52; const uint16_t v53 = *w53++; out[53] = v53; const uint16_t v54 = *w54++; out[54] = v54; const uint16_t v55 = *w55++; out[55] = v55; const uint16_t v56 = *w56++; out[56] = v56; const uint16_t v57 = *w57++; out[57] = v57; const uint16_t v58 = *w58++; out[58] = v58; const uint16_t v59 = *w59++; out[59] = v59; const uint16_t v60 = *w60++; out[60] = v60; const uint16_t v61 = *w61++; out[61] = v61; const uint16_t v62 = *w62++; out[62] = v62; out += 64; } out = (uint16_t*) ((uintptr_t) out + extra_bytes); } weights += nc * kc; } while (--g != 0); }