//// -------------------------- //// ATTENTION: //// THIS CODE IS AUTOGENERATED //// BY sve_emblookup_codegen.py //// DO NOT MODIFY!!! //// -------------------------- #include #include #include #include #include namespace caffe2 { template static bool EmbeddingLookupIdx_int32_t_float_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if 
(idx8 < 0 || idx8 >= data_size) { return false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if (idx10 < 0 || idx10 >= data_size) { return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? (j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 15 - start_offset) : pos + 15]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; const float* const ip2 = &input[idx2 * block_size]; const float* const ip3 = &input[idx3 * block_size]; const float* const ip4 = &input[idx4 * block_size]; const float* const ip5 = &input[idx5 * block_size]; const float* const ip6 = &input[idx6 * block_size]; const float* const ip7 = &input[idx7 * block_size]; const float* const ip8 = &input[idx8 * block_size]; const float* const ip9 = &input[idx9 * block_size]; const float* const ip10 = &input[idx10 * block_size]; const float* const ip11 = &input[idx11 * block_size]; const float* const ip12 = &input[idx12 * block_size]; const float* const ip13 = &input[idx13 * block_size]; const float* const ip14 = &input[idx14 * block_size]; const float* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(svAll, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(svAll, output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(svAll, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(svAll, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(svAll, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(svAll, output, svld1(svAll, &ip7[k]), wgt7); output = svmla_x(svAll, output, svld1(svAll, &ip8[k]), wgt8); output = svmla_x(svAll, output, svld1(svAll, &ip9[k]), wgt9); output = svmla_x(svAll, output, svld1(svAll, &ip10[k]), wgt10); output = svmla_x(svAll, output, svld1(svAll, &ip11[k]), wgt11); output = svmla_x(svAll, output, svld1(svAll, &ip12[k]), wgt12); output = svmla_x(svAll, output, svld1(svAll, &ip13[k]), wgt13); output = svmla_x(svAll, output, svld1(svAll, &ip14[k]), wgt14); output = svmla_x(svAll, output, svld1(svAll, &ip15[k]), wgt15); svst1(svAll, 
&op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(pg, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(pg, output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(pg, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(pg, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(pg, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(pg, output, svld1(svAll, &ip7[k]), wgt7); output = svmla_x(pg, output, svld1(svAll, &ip8[k]), wgt8); output = svmla_x(pg, output, svld1(svAll, &ip9[k]), wgt9); output = svmla_x(pg, output, svld1(svAll, &ip10[k]), wgt10); output = svmla_x(pg, output, svld1(svAll, &ip11[k]), wgt11); output = svmla_x(pg, output, svld1(svAll, &ip12[k]), wgt12); output = svmla_x(pg, output, svld1(svAll, &ip13[k]), wgt13); output = svmla_x(pg, output, svld1(svAll, &ip14[k]), wgt14); output = svmla_x(pg, output, svld1(svAll, &ip15[k]), wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float 
wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; const float* const ip2 = &input[idx2 * block_size]; const float* const ip3 = &input[idx3 * block_size]; const float* const ip4 = &input[idx4 * block_size]; const float* const ip5 = &input[idx5 * block_size]; const float* const ip6 = &input[idx6 * block_size]; const float* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(svAll, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(svAll, output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(svAll, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(svAll, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(svAll, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(svAll, output, svld1(svAll, &ip7[k]), wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(pg, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(pg, 
output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(pg, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(pg, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(pg, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(pg, output, svld1(svAll, &ip7[k]), wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 3 - start_offset) : pos + 3]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; const float* const ip2 = &input[idx2 * block_size]; const float* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(svAll, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(svAll, output, svld1(svAll, &ip3[k]), wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(pg, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(pg, output, svld1(svAll, &ip3[k]), wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 1 - start_offset) : pos + 1]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; } const float* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } bool EmbeddingLookupIdx_int32_t_float_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float* input, const int32_t* 
indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int32_t_float_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } bool EmbeddingLookupIdx_int32_t_float_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int32_t_float_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } template static bool EmbeddingLookupIdx_int64_t_float_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = 
indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if (idx8 < 0 || idx8 >= data_size) { return false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if (idx10 < 0 || idx10 >= data_size) { return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? (j + 15 - start_offset) : pos + 15]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; const float* const ip2 = &input[idx2 * block_size]; const float* const ip3 = &input[idx3 * block_size]; const float* const ip4 = &input[idx4 * block_size]; const float* const ip5 = &input[idx5 * block_size]; const float* const ip6 = &input[idx6 * block_size]; const float* const ip7 = &input[idx7 * block_size]; const float* const ip8 = &input[idx8 * block_size]; const float* const ip9 = &input[idx9 * block_size]; const float* const ip10 = &input[idx10 * block_size]; const float* const ip11 = &input[idx11 * block_size]; const float* const ip12 = &input[idx12 * block_size]; const float* const ip13 = &input[idx13 * block_size]; const float* const ip14 = &input[idx14 * block_size]; const float* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(svAll, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(svAll, output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(svAll, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(svAll, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(svAll, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(svAll, output, svld1(svAll, &ip7[k]), wgt7); output = svmla_x(svAll, output, svld1(svAll, &ip8[k]), wgt8); 
output = svmla_x(svAll, output, svld1(svAll, &ip9[k]), wgt9); output = svmla_x(svAll, output, svld1(svAll, &ip10[k]), wgt10); output = svmla_x(svAll, output, svld1(svAll, &ip11[k]), wgt11); output = svmla_x(svAll, output, svld1(svAll, &ip12[k]), wgt12); output = svmla_x(svAll, output, svld1(svAll, &ip13[k]), wgt13); output = svmla_x(svAll, output, svld1(svAll, &ip14[k]), wgt14); output = svmla_x(svAll, output, svld1(svAll, &ip15[k]), wgt15); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(pg, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(pg, output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(pg, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(pg, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(pg, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(pg, output, svld1(svAll, &ip7[k]), wgt7); output = svmla_x(pg, output, svld1(svAll, &ip8[k]), wgt8); output = svmla_x(pg, output, svld1(svAll, &ip9[k]), wgt9); output = svmla_x(pg, output, svld1(svAll, &ip10[k]), wgt10); output = svmla_x(pg, output, svld1(svAll, &ip11[k]), wgt11); output = svmla_x(pg, output, svld1(svAll, &ip12[k]), wgt12); output = svmla_x(pg, output, svld1(svAll, &ip13[k]), wgt13); output = svmla_x(pg, output, svld1(svAll, &ip14[k]), wgt14); output = svmla_x(pg, output, svld1(svAll, &ip15[k]), wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if 
(idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 7 - start_offset) : pos + 7]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; const float* const ip2 = &input[idx2 * block_size]; const float* const ip3 = &input[idx3 * block_size]; const float* const ip4 = &input[idx4 * block_size]; const float* const ip5 = &input[idx5 * block_size]; const float* const ip6 = &input[idx6 * block_size]; const float* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(svAll, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(svAll, output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(svAll, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(svAll, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(svAll, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(svAll, output, svld1(svAll, &ip7[k]), wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(pg, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(pg, output, svld1(svAll, &ip3[k]), wgt3); output = svmla_x(pg, output, svld1(svAll, &ip4[k]), wgt4); output = svmla_x(pg, output, svld1(svAll, &ip5[k]), wgt5); output = svmla_x(pg, output, svld1(svAll, &ip6[k]), wgt6); output = svmla_x(pg, output, svld1(svAll, &ip7[k]), wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) 
{ return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; const float* const ip2 = &input[idx2 * block_size]; const float* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(svAll, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(svAll, output, svld1(svAll, &ip3[k]), wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); output = svmla_x(pg, output, svld1(svAll, &ip2[k]), wgt2); output = svmla_x(pg, output, svld1(svAll, &ip3[k]), wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 1 - start_offset) : pos + 1]; } const float* const ip0 = &input[idx0 * block_size]; const float* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(svAll, output, svld1(svAll, &ip1[k]), wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); output = svmla_x(pg, output, svld1(svAll, &ip1[k]), wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; } const float* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svmla_x(svAll, output, svld1(svAll, &ip0[k]), wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svmla_x(pg, output, svld1(svAll, &ip0[k]), wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } bool EmbeddingLookupIdx_int64_t_float_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float* input, const int64_t* 
indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int64_t_float_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } bool EmbeddingLookupIdx_int64_t_float_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int64_t_float_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } template static bool EmbeddingLookupIdx_int32_t_half_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = 
indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if (idx8 < 0 || idx8 >= data_size) { return false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if (idx10 < 0 || idx10 >= data_size) { return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? (j + 15 - start_offset) : pos + 15]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; const at::Half* const ip2 = &input[idx2 * block_size]; const at::Half* const ip3 = &input[idx3 * block_size]; const at::Half* const ip4 = &input[idx4 * block_size]; const at::Half* const ip5 = &input[idx5 * block_size]; const at::Half* const ip6 = &input[idx6 * block_size]; const at::Half* const ip7 = &input[idx7 * block_size]; const at::Half* const ip8 = &input[idx8 * block_size]; const at::Half* const ip9 = &input[idx9 * block_size]; const at::Half* const ip10 = &input[idx10 * block_size]; const at::Half* const ip11 = &input[idx11 * block_size]; const at::Half* const ip12 = &input[idx12 * block_size]; const at::Half* const ip13 = &input[idx13 * block_size]; const at::Half* const ip14 = &input[idx14 * block_size]; const at::Half* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip4[k])))); auto 
input5 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip7[k])))); auto input8 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip8[k])))); auto input9 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip9[k])))); auto input10 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip10[k])))); auto input11 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip11[k])))); auto input12 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip12[k])))); auto input13 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip13[k])))); auto input14 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip14[k])))); auto input15 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip15[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); output = svmla_x(svAll, output, input8, wgt8); output = svmla_x(svAll, output, input9, wgt9); output = svmla_x(svAll, output, input10, wgt10); output = svmla_x(svAll, output, input11, wgt11); output = svmla_x(svAll, output, input12, wgt12); output = svmla_x(svAll, output, input13, wgt13); output = svmla_x(svAll, output, input14, wgt14); output = svmla_x(svAll, output, input15, wgt15); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = 
svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip4[k])))); auto input5 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip7[k])))); auto input8 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip8[k])))); auto input9 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip9[k])))); auto input10 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip10[k])))); auto input11 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip11[k])))); auto input12 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip12[k])))); auto input13 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip13[k])))); auto input14 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip14[k])))); auto input15 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip15[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); output = svmla_x(pg, output, input8, wgt8); output = svmla_x(pg, output, input9, wgt9); output = 
svmla_x(pg, output, input10, wgt10); output = svmla_x(pg, output, input11, wgt11); output = svmla_x(pg, output, input12, wgt12); output = svmla_x(pg, output, input13, wgt13); output = svmla_x(pg, output, input14, wgt14); output = svmla_x(pg, output, input15, wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 7 - start_offset) : pos + 7]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; const at::Half* const ip2 = &input[idx2 * block_size]; const at::Half* const ip3 = &input[idx3 * block_size]; const at::Half* const ip4 = &input[idx4 * block_size]; const at::Half* const ip5 = &input[idx5 * block_size]; const at::Half* const ip6 = &input[idx6 * block_size]; const at::Half* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip4[k])))); auto input5 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip7[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto 
input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip4[k])))); auto input5 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip7[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 3 - start_offset) : pos + 3]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; const at::Half* const ip2 = &input[idx2 * block_size]; const at::Half* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip3[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip3[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; if (weights) { wgt0 = 
weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 0 - start_offset) : pos + 0]; } const at::Half* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); output = svmla_x(svAll, output, input0, wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); output = svmla_x(pg, output, input0, wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } bool EmbeddingLookupIdx_int32_t_half_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int32_t_half_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } bool EmbeddingLookupIdx_int32_t_half_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int32_t_half_float__sve( block_size, output_size, index_size, 
data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } template static bool EmbeddingLookupIdx_int64_t_half_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if (idx8 < 0 || idx8 >= data_size) { return false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if (idx10 < 0 || idx10 >= data_size) { 
return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? (j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 15 - start_offset) : pos + 15]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; const at::Half* const ip2 = &input[idx2 * block_size]; const at::Half* const ip3 = &input[idx3 * block_size]; const at::Half* const ip4 = &input[idx4 * block_size]; const at::Half* const ip5 = &input[idx5 * block_size]; const at::Half* const ip6 = &input[idx6 * block_size]; const at::Half* const ip7 = &input[idx7 * block_size]; const at::Half* const ip8 = &input[idx8 * block_size]; const at::Half* const ip9 = &input[idx9 * block_size]; const at::Half* const ip10 = &input[idx10 * block_size]; const at::Half* const ip11 = &input[idx11 * block_size]; const at::Half* const ip12 = &input[idx12 * block_size]; const at::Half* const ip13 = &input[idx13 * block_size]; const at::Half* const ip14 = &input[idx14 * block_size]; const at::Half* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip4[k])))); auto input5 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip7[k])))); auto input8 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip8[k])))); auto input9 = svcvt_f32_x(svAll, svreinterpret_f16( 
svld1uh_u32(svAll, reinterpret_cast(&ip9[k])))); auto input10 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip10[k])))); auto input11 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip11[k])))); auto input12 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip12[k])))); auto input13 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip13[k])))); auto input14 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip14[k])))); auto input15 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip15[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); output = svmla_x(svAll, output, input8, wgt8); output = svmla_x(svAll, output, input9, wgt9); output = svmla_x(svAll, output, input10, wgt10); output = svmla_x(svAll, output, input11, wgt11); output = svmla_x(svAll, output, input12, wgt12); output = svmla_x(svAll, output, input13, wgt13); output = svmla_x(svAll, output, input14, wgt14); output = svmla_x(svAll, output, input15, wgt15); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(pg, svreinterpret_f16( 
svld1uh_u32(pg, reinterpret_cast(&ip4[k])))); auto input5 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip7[k])))); auto input8 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip8[k])))); auto input9 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip9[k])))); auto input10 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip10[k])))); auto input11 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip11[k])))); auto input12 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip12[k])))); auto input13 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip13[k])))); auto input14 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip14[k])))); auto input15 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip15[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); output = svmla_x(pg, output, input8, wgt8); output = svmla_x(pg, output, input9, wgt9); output = svmla_x(pg, output, input10, wgt10); output = svmla_x(pg, output, input11, wgt11); output = svmla_x(pg, output, input12, wgt12); output = svmla_x(pg, output, input13, wgt13); output = svmla_x(pg, output, input14, wgt14); output = svmla_x(pg, output, input15, wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 
1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 7 - start_offset) : pos + 7]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; const at::Half* const ip2 = &input[idx2 * block_size]; const at::Half* const ip3 = &input[idx3 * block_size]; const at::Half* const ip4 = &input[idx4 * block_size]; const at::Half* const ip5 = &input[idx5 * block_size]; const at::Half* const ip6 = &input[idx6 * block_size]; const at::Half* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip4[k])))); auto input5 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip7[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto 
input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip3[k])))); auto input4 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip4[k])))); auto input5 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip5[k])))); auto input6 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip6[k])))); auto input7 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip7[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 3 - start_offset) : pos + 3]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; const at::Half* const ip2 = &input[idx2 * block_size]; const at::Half* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip3[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); auto input2 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip2[k])))); auto input3 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip3[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; if (weights) { wgt0 = 
weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; } const at::Half* const ip0 = &input[idx0 * block_size]; const at::Half* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip1[k])))); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); auto input1 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip1[k])))); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 0 - start_offset) : pos + 0]; } const at::Half* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svcvt_f32_x(svAll, svreinterpret_f16( svld1uh_u32(svAll, reinterpret_cast(&ip0[k])))); output = svmla_x(svAll, output, input0, wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svcvt_f32_x(pg, svreinterpret_f16( svld1uh_u32(pg, reinterpret_cast(&ip0[k])))); output = svmla_x(pg, output, input0, wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } bool EmbeddingLookupIdx_int64_t_half_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int64_t_half_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } bool EmbeddingLookupIdx_int64_t_half_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int64_t_half_float__sve( block_size, output_size, index_size, 
data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } template static bool EmbeddingLookupIdx_int32_t_bfloat16_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::BFloat16* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if (idx8 < 0 || idx8 >= data_size) { return false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if (idx10 < 0 || idx10 >= 
data_size) { return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? (j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 15 - start_offset) : pos + 15]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; const at::BFloat16* const ip2 = &input[idx2 * block_size]; const at::BFloat16* const ip3 = &input[idx3 * block_size]; const at::BFloat16* const ip4 = &input[idx4 * block_size]; const at::BFloat16* const ip5 = &input[idx5 * block_size]; const at::BFloat16* const ip6 = &input[idx6 * block_size]; const at::BFloat16* const ip7 = &input[idx7 * block_size]; const at::BFloat16* const ip8 = &input[idx8 * block_size]; const at::BFloat16* const ip9 = &input[idx9 * block_size]; const at::BFloat16* const ip10 = &input[idx10 * block_size]; const at::BFloat16* const ip11 = &input[idx11 * block_size]; const at::BFloat16* const ip12 = &input[idx12 * block_size]; const at::BFloat16* const ip13 = &input[idx13 * block_size]; const at::BFloat16* const ip14 = &input[idx14 * block_size]; const at::BFloat16* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip3[k])), 16)); auto input4 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip7[k])), 16)); auto input8 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip8[k])), 16)); auto 
input9 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip9[k])), 16)); auto input10 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip10[k])), 16)); auto input11 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip11[k])), 16)); auto input12 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip12[k])), 16)); auto input13 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip13[k])), 16)); auto input14 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip14[k])), 16)); auto input15 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip15[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); output = svmla_x(svAll, output, input8, wgt8); output = svmla_x(svAll, output, input9, wgt9); output = svmla_x(svAll, output, input10, wgt10); output = svmla_x(svAll, output, input11, wgt11); output = svmla_x(svAll, output, input12, wgt12); output = svmla_x(svAll, output, input13, wgt13); output = svmla_x(svAll, output, input14, wgt14); output = svmla_x(svAll, output, input15, wgt15); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip3[k])), 16)); auto input4 = 
svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip7[k])), 16)); auto input8 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip8[k])), 16)); auto input9 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip9[k])), 16)); auto input10 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip10[k])), 16)); auto input11 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip11[k])), 16)); auto input12 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip12[k])), 16)); auto input13 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip13[k])), 16)); auto input14 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip14[k])), 16)); auto input15 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip15[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); output = svmla_x(pg, output, input8, wgt8); output = svmla_x(pg, output, input9, wgt9); output = svmla_x(pg, output, input10, wgt10); output = svmla_x(pg, output, input11, wgt11); output = svmla_x(pg, output, input12, wgt12); output = svmla_x(pg, output, input13, wgt13); output = svmla_x(pg, output, input14, wgt14); output = svmla_x(pg, output, input15, wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const 
auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 7 - start_offset) : pos + 7]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; const at::BFloat16* const ip2 = &input[idx2 * block_size]; const at::BFloat16* const ip3 = &input[idx3 * block_size]; const at::BFloat16* const ip4 = &input[idx4 * block_size]; const at::BFloat16* const ip5 = &input[idx5 * block_size]; const at::BFloat16* const ip6 = &input[idx6 * block_size]; const at::BFloat16* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip3[k])), 16)); auto input4 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip7[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, 
reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip3[k])), 16)); auto input4 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip7[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 3 - start_offset) : pos + 3]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; const at::BFloat16* const ip2 = &input[idx2 * block_size]; const at::BFloat16* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip3[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip3[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; if (weights) { wgt0 = 
weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 0 - start_offset) : pos + 0]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); output = svmla_x(pg, output, input0, wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } bool EmbeddingLookupIdx_int32_t_bfloat16_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::BFloat16* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int32_t_bfloat16_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } bool EmbeddingLookupIdx_int32_t_bfloat16_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::BFloat16* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int32_t_bfloat16_float__sve( block_size, 
output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } template static bool EmbeddingLookupIdx_int64_t_bfloat16_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::BFloat16* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if (idx8 < 0 || idx8 >= data_size) { return false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if 
(idx10 < 0 || idx10 >= data_size) { return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? (j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 15 - start_offset) : pos + 15]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; const at::BFloat16* const ip2 = &input[idx2 * block_size]; const at::BFloat16* const ip3 = &input[idx3 * block_size]; const at::BFloat16* const ip4 = &input[idx4 * block_size]; const at::BFloat16* const ip5 = &input[idx5 * block_size]; const at::BFloat16* const ip6 = &input[idx6 * block_size]; const at::BFloat16* const ip7 = &input[idx7 * block_size]; const at::BFloat16* const ip8 = &input[idx8 * block_size]; const at::BFloat16* const ip9 = &input[idx9 * block_size]; const at::BFloat16* const ip10 = &input[idx10 * block_size]; const at::BFloat16* const ip11 = &input[idx11 * block_size]; const at::BFloat16* const ip12 = &input[idx12 * block_size]; const at::BFloat16* const ip13 = &input[idx13 * block_size]; const at::BFloat16* const ip14 = &input[idx14 * block_size]; const at::BFloat16* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip3[k])), 16)); auto input4 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip7[k])), 16)); auto input8 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip8[k])), 16)); auto 
input9 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip9[k])), 16)); auto input10 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip10[k])), 16)); auto input11 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip11[k])), 16)); auto input12 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip12[k])), 16)); auto input13 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip13[k])), 16)); auto input14 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip14[k])), 16)); auto input15 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip15[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); output = svmla_x(svAll, output, input8, wgt8); output = svmla_x(svAll, output, input9, wgt9); output = svmla_x(svAll, output, input10, wgt10); output = svmla_x(svAll, output, input11, wgt11); output = svmla_x(svAll, output, input12, wgt12); output = svmla_x(svAll, output, input13, wgt13); output = svmla_x(svAll, output, input14, wgt14); output = svmla_x(svAll, output, input15, wgt15); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip3[k])), 16)); auto input4 = 
svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip7[k])), 16)); auto input8 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip8[k])), 16)); auto input9 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip9[k])), 16)); auto input10 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip10[k])), 16)); auto input11 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip11[k])), 16)); auto input12 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip12[k])), 16)); auto input13 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip13[k])), 16)); auto input14 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip14[k])), 16)); auto input15 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip15[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); output = svmla_x(pg, output, input8, wgt8); output = svmla_x(pg, output, input9, wgt9); output = svmla_x(pg, output, input10, wgt10); output = svmla_x(pg, output, input11, wgt11); output = svmla_x(pg, output, input12, wgt12); output = svmla_x(pg, output, input13, wgt13); output = svmla_x(pg, output, input14, wgt14); output = svmla_x(pg, output, input15, wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const 
auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 7 - start_offset) : pos + 7]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; const at::BFloat16* const ip2 = &input[idx2 * block_size]; const at::BFloat16* const ip3 = &input[idx3 * block_size]; const at::BFloat16* const ip4 = &input[idx4 * block_size]; const at::BFloat16* const ip5 = &input[idx5 * block_size]; const at::BFloat16* const ip6 = &input[idx6 * block_size]; const at::BFloat16* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip3[k])), 16)); auto input4 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip7[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, 
reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip3[k])), 16)); auto input4 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip4[k])), 16)); auto input5 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip5[k])), 16)); auto input6 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip6[k])), 16)); auto input7 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip7[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 3 - start_offset) : pos + 3]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; const at::BFloat16* const ip2 = &input[idx2 * block_size]; const at::BFloat16* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip3[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); auto input2 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip2[k])), 16)); auto input3 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip3[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; if (weights) { wgt0 = 
weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; const at::BFloat16* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip1[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); auto input1 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip1[k])), 16)); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 0 - start_offset) : pos + 0]; } const at::BFloat16* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(svAll, svld1uh_u32(svAll, reinterpret_cast(&ip0[k])), 16)); output = svmla_x(svAll, output, input0, wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); auto input0 = svreinterpret_f32(svlsl_x(pg, svld1uh_u32(pg, reinterpret_cast(&ip0[k])), 16)); output = svmla_x(pg, output, input0, wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } /* Non-template entry point (the _false variant) for int64_t indices and offsets, at::BFloat16 input rows and float output.
 * Passes every argument through unchanged to EmbeddingLookupIdx_int64_t_bfloat16_float__sve and returns its bool result.
 * NOTE(review): the explicit template argument on the forwarded call is not visible in this text; going by the function name it is presumably false for IS_WEIGHT_POSITIONAL - confirm against sve_emblookup_codegen.py.
 */ bool EmbeddingLookupIdx_int64_t_bfloat16_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::BFloat16* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int64_t_bfloat16_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } /* Non-template entry point (the _true variant) with the same parameter list as the _false variant.
 * Passes every argument through unchanged to EmbeddingLookupIdx_int64_t_bfloat16_float__sve and returns its bool result.
 * NOTE(review): the explicit template argument on the forwarded call is not visible in this text; going by the function name it is presumably true for IS_WEIGHT_POSITIONAL - confirm against sve_emblookup_codegen.py.
 */ bool EmbeddingLookupIdx_int64_t_bfloat16_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::BFloat16* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int64_t_bfloat16_float__sve( block_size,
output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } /* SVE kernel that accumulates weighted uint8 embedding rows into float output rows, using int32_t indices and offsets.
 *
 * For each i in [0, output_size): the block_size floats at out[i * block_size] are zeroed, then the indices belonging to
 * the segment offsets[i] .. offsets[i + 1] are consumed in unrolled groups of 16, 8, 4 and 2, followed by at most one
 * leftover index. offsets[i + 1] is read for every i, so offsets must provide output_size + 1 readable entries.
 *
 * Per index: the weight starts at 1.f and is replaced by an entry of weights when weights is non-null; that entry is
 * selected by the position inside the segment when IS_WEIGHT_POSITIONAL is set, otherwise by the running position pos.
 * When scale_bias is non-null, the unscaled weight times scale_bias[2 * idx + 1] is accumulated into a per-group bias
 * and the weight is then multiplied by scale_bias[2 * idx]. The bias is added to every element of the output row.
 * Row bytes are loaded with svld1ub_u32, converted to float and multiply-accumulated into the row; a predicated pass
 * (svwhilelt_b32_s64) covers the elements left over when block_size is not a multiple of vLen.
 *
 * After the segment, if normalize_by_lengths is set and length != 0, the row is multiplied by 1.0f / length.
 *
 * Returns false when pos != offsets[i] - offsets[0] at the start of a segment or when any index lies outside
 * [0, data_size); otherwise returns pos == index_size.
 *
 * NOTE(review): the template parameter list is not visible in this text; the body uses IS_WEIGHT_POSITIONAL, so it is
 * presumably a bool non-type template parameter - confirm against sve_emblookup_codegen.py.
 * NOTE(review): idx is int32_t in this variant, so 2 * idx + 1 is evaluated in int and would overflow for
 * idx >= 2^30 - confirm data_size never gets that large, or widen the expression in the generator.
 */ template static bool EmbeddingLookupIdx_int32_t_uint8_t_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if (idx8 < 0 || idx8 >= data_size) { return false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if (idx10 <
0 || idx10 >= data_size) { return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? (j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 15 - start_offset) : pos + 15]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; bio += wgt2 * scale_bias[2 * idx2 + 1]; wgt2 = wgt2 * scale_bias[2 * idx2]; bio += wgt3 * scale_bias[2 * idx3 + 1]; wgt3 = wgt3 * scale_bias[2 * idx3]; bio += wgt4 * scale_bias[2 * idx4 + 1]; wgt4 = wgt4 * scale_bias[2 * idx4]; bio += wgt5 * scale_bias[2 * idx5 + 1]; wgt5 = wgt5 * scale_bias[2 * idx5]; bio += wgt6 * scale_bias[2 * idx6 + 1]; wgt6 = wgt6 * scale_bias[2 * idx6]; bio += wgt7 * scale_bias[2 * idx7 + 1]; wgt7 = wgt7 * scale_bias[2 * idx7]; bio += wgt8 * scale_bias[2 * idx8 + 1]; wgt8 = wgt8 * scale_bias[2 * idx8]; bio += wgt9 * scale_bias[2 * idx9 + 1]; wgt9 = wgt9 * scale_bias[2 * idx9]; bio += wgt10 * scale_bias[2 * idx10 + 1]; wgt10 = wgt10 * scale_bias[2 * idx10]; bio += wgt11 * scale_bias[2 * idx11 + 1]; wgt11 = wgt11 * scale_bias[2 * idx11]; bio += wgt12 * scale_bias[2 * idx12 + 1]; wgt12 = wgt12 * scale_bias[2 * idx12]; bio += wgt13 * scale_bias[2 * idx13 + 1]; wgt13 = wgt13 * scale_bias[2 * idx13]; bio += wgt14 * scale_bias[2 * idx14 + 1]; wgt14 = wgt14 * scale_bias[2 * idx14]; bio += wgt15 * scale_bias[2 * idx15 + 1]; wgt15 = wgt15 * scale_bias[2 * idx15]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; const uint8_t* const ip2 = &input[idx2 * block_size]; const uint8_t* const ip3 = &input[idx3 * block_size]; const uint8_t* const ip4 = &input[idx4 * block_size]; const uint8_t* const ip5 = &input[idx5 * block_size]; const uint8_t* const ip6 = &input[idx6 * block_size]; const uint8_t* const ip7 = &input[idx7 * block_size]; const uint8_t* const ip8 = &input[idx8 * block_size]; const uint8_t* const ip9 = &input[idx9 * block_size]; const uint8_t* const ip10 = &input[idx10 * block_size]; const uint8_t* const ip11 = &input[idx11 * block_size]; const uint8_t* const ip12 = &input[idx12 * 
block_size]; const uint8_t* const ip13 = &input[idx13 * block_size]; const uint8_t* const ip14 = &input[idx14 * block_size]; const uint8_t* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); auto input2 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip2[k])); auto input3 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip3[k])); auto input4 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip4[k])); auto input5 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip5[k])); auto input6 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip6[k])); auto input7 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip7[k])); auto input8 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip8[k])); auto input9 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip9[k])); auto input10 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip10[k])); auto input11 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip11[k])); auto input12 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip12[k])); auto input13 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip13[k])); auto input14 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip14[k])); auto input15 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip15[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); output = svmla_x(svAll, output, input8, wgt8); output = svmla_x(svAll, output, input9, wgt9); output = svmla_x(svAll, output, input10, wgt10); output = svmla_x(svAll, output, input11, wgt11); output = svmla_x(svAll, output, input12, wgt12); output = svmla_x(svAll, output, input13, 
wgt13); output = svmla_x(svAll, output, input14, wgt14); output = svmla_x(svAll, output, input15, wgt15); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); auto input2 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip2[k])); auto input3 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip3[k])); auto input4 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip4[k])); auto input5 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip5[k])); auto input6 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip6[k])); auto input7 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip7[k])); auto input8 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip8[k])); auto input9 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip9[k])); auto input10 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip10[k])); auto input11 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip11[k])); auto input12 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip12[k])); auto input13 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip13[k])); auto input14 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip14[k])); auto input15 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip15[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); output = svmla_x(pg, output, input8, wgt8); output = svmla_x(pg, output, input9, wgt9); output = svmla_x(pg, output, input10, wgt10); output = svmla_x(pg, output, input11, wgt11); output = svmla_x(pg, output, input12, wgt12); output = svmla_x(pg, output, input13, wgt13); output = svmla_x(pg, output, input14, wgt14); output = svmla_x(pg, output, input15, wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 
times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 7 - start_offset) : pos + 7]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; bio += wgt2 * scale_bias[2 * idx2 + 1]; wgt2 = wgt2 * scale_bias[2 * idx2]; bio += wgt3 * scale_bias[2 * idx3 + 1]; wgt3 = wgt3 * scale_bias[2 * idx3]; bio += wgt4 * scale_bias[2 * idx4 + 1]; wgt4 = wgt4 * scale_bias[2 * idx4]; bio += wgt5 * scale_bias[2 * idx5 + 1]; wgt5 = wgt5 * scale_bias[2 * idx5]; bio += wgt6 * scale_bias[2 * idx6 + 1]; wgt6 = wgt6 * scale_bias[2 * idx6]; bio += wgt7 * scale_bias[2 * idx7 + 1]; wgt7 = wgt7 * scale_bias[2 * idx7]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; const uint8_t* const ip2 = &input[idx2 * block_size]; const uint8_t* const ip3 = &input[idx3 * block_size]; const uint8_t* const ip4 = &input[idx4 * block_size]; const uint8_t* const ip5 = &input[idx5 * block_size]; const uint8_t* const ip6 = &input[idx6 * block_size]; const uint8_t* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); auto input2 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip2[k])); auto input3 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip3[k])); auto input4 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip4[k])); auto input5 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip5[k])); auto input6 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip6[k])); auto input7 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip7[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, 
output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); auto input2 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip2[k])); auto input3 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip3[k])); auto input4 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip4[k])); auto input5 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip5[k])); auto input6 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip6[k])); auto input7 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip7[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 3 - start_offset) : pos + 3]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; bio += wgt2 * scale_bias[2 * idx2 + 1]; wgt2 = wgt2 * scale_bias[2 * idx2]; bio += wgt3 * scale_bias[2 * idx3 + 1]; wgt3 = wgt3 * scale_bias[2 * idx3]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; const uint8_t* const ip2 = &input[idx2 * block_size]; const uint8_t* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); auto input2 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip2[k])); auto input3 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip3[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); auto input2 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip2[k])); auto input3 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip3[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= 
data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 0 - start_offset) : pos + 0]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; } const uint8_t* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); output = svmla_x(svAll, output, input0, wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); output = svmla_x(pg, output, input0, wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } /* Non-template entry point (the _false variant) for int32_t indices and offsets, uint8_t input rows and float output.
 * Passes every argument through unchanged to EmbeddingLookupIdx_int32_t_uint8_t_float__sve and returns its bool result.
 * NOTE(review): the explicit template argument on the forwarded call is not visible in this text; going by the function name it is presumably false for IS_WEIGHT_POSITIONAL - confirm against sve_emblookup_codegen.py.
 */ bool EmbeddingLookupIdx_int32_t_uint8_t_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int32_t_uint8_t_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } /* Non-template entry point (the _true variant) with the same parameter list as the _false variant.
 * Passes every argument through unchanged to EmbeddingLookupIdx_int32_t_uint8_t_float__sve and returns its bool result.
 * NOTE(review): the explicit template argument on the forwarded call is not visible in this text; going by the function name it is presumably true for IS_WEIGHT_POSITIONAL - confirm against sve_emblookup_codegen.py.
 */ bool EmbeddingLookupIdx_int32_t_uint8_t_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t* input, const int32_t* indices, const int32_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float*
out) { return EmbeddingLookupIdx_int32_t_uint8_t_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } template static bool EmbeddingLookupIdx_int64_t_uint8_t_float__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { const svbool_t svAll = svptrue_b32(); const auto vLen = static_cast(svcntw()); int64_t pos = 0; for (int64_t i = 0; i < output_size; ++i) { float* const op = &out[i * block_size]; memset(op, 0, sizeof(float) * block_size); if (pos != offsets[i] - offsets[0]) { return false; } int64_t start_offset = offsets[i]; int64_t end_offset = offsets[i + 1]; int64_t j = start_offset; // unrolling 16 times while (j + 15 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; const auto idx8 = indices[pos + 8]; const auto idx9 = indices[pos + 9]; const auto idx10 = indices[pos + 10]; const auto idx11 = indices[pos + 11]; const auto idx12 = indices[pos + 12]; const auto idx13 = indices[pos + 13]; const auto idx14 = indices[pos + 14]; const auto idx15 = indices[pos + 15]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } if (idx8 < 0 || idx8 >= data_size) { return 
false; } if (idx9 < 0 || idx9 >= data_size) { return false; } if (idx10 < 0 || idx10 >= data_size) { return false; } if (idx11 < 0 || idx11 >= data_size) { return false; } if (idx12 < 0 || idx12 >= data_size) { return false; } if (idx13 < 0 || idx13 >= data_size) { return false; } if (idx14 < 0 || idx14 >= data_size) { return false; } if (idx15 < 0 || idx15 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float wgt8 = 1.f; float wgt9 = 1.f; float wgt10 = 1.f; float wgt11 = 1.f; float wgt12 = 1.f; float wgt13 = 1.f; float wgt14 = 1.f; float wgt15 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? (j + 7 - start_offset) : pos + 7]; wgt8 = weights[IS_WEIGHT_POSITIONAL ? (j + 8 - start_offset) : pos + 8]; wgt9 = weights[IS_WEIGHT_POSITIONAL ? (j + 9 - start_offset) : pos + 9]; wgt10 = weights[IS_WEIGHT_POSITIONAL ? (j + 10 - start_offset) : pos + 10]; wgt11 = weights[IS_WEIGHT_POSITIONAL ? (j + 11 - start_offset) : pos + 11]; wgt12 = weights[IS_WEIGHT_POSITIONAL ? (j + 12 - start_offset) : pos + 12]; wgt13 = weights[IS_WEIGHT_POSITIONAL ? (j + 13 - start_offset) : pos + 13]; wgt14 = weights[IS_WEIGHT_POSITIONAL ? (j + 14 - start_offset) : pos + 14]; wgt15 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 15 - start_offset) : pos + 15]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; bio += wgt2 * scale_bias[2 * idx2 + 1]; wgt2 = wgt2 * scale_bias[2 * idx2]; bio += wgt3 * scale_bias[2 * idx3 + 1]; wgt3 = wgt3 * scale_bias[2 * idx3]; bio += wgt4 * scale_bias[2 * idx4 + 1]; wgt4 = wgt4 * scale_bias[2 * idx4]; bio += wgt5 * scale_bias[2 * idx5 + 1]; wgt5 = wgt5 * scale_bias[2 * idx5]; bio += wgt6 * scale_bias[2 * idx6 + 1]; wgt6 = wgt6 * scale_bias[2 * idx6]; bio += wgt7 * scale_bias[2 * idx7 + 1]; wgt7 = wgt7 * scale_bias[2 * idx7]; bio += wgt8 * scale_bias[2 * idx8 + 1]; wgt8 = wgt8 * scale_bias[2 * idx8]; bio += wgt9 * scale_bias[2 * idx9 + 1]; wgt9 = wgt9 * scale_bias[2 * idx9]; bio += wgt10 * scale_bias[2 * idx10 + 1]; wgt10 = wgt10 * scale_bias[2 * idx10]; bio += wgt11 * scale_bias[2 * idx11 + 1]; wgt11 = wgt11 * scale_bias[2 * idx11]; bio += wgt12 * scale_bias[2 * idx12 + 1]; wgt12 = wgt12 * scale_bias[2 * idx12]; bio += wgt13 * scale_bias[2 * idx13 + 1]; wgt13 = wgt13 * scale_bias[2 * idx13]; bio += wgt14 * scale_bias[2 * idx14 + 1]; wgt14 = wgt14 * scale_bias[2 * idx14]; bio += wgt15 * scale_bias[2 * idx15 + 1]; wgt15 = wgt15 * scale_bias[2 * idx15]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; const uint8_t* const ip2 = &input[idx2 * block_size]; const uint8_t* const ip3 = &input[idx3 * block_size]; const uint8_t* const ip4 = &input[idx4 * block_size]; const uint8_t* const ip5 = &input[idx5 * block_size]; const uint8_t* const ip6 = &input[idx6 * block_size]; const uint8_t* const ip7 = &input[idx7 * block_size]; const uint8_t* const ip8 = &input[idx8 * block_size]; const uint8_t* const ip9 = &input[idx9 * block_size]; const uint8_t* const ip10 = &input[idx10 * block_size]; const uint8_t* const ip11 = &input[idx11 * block_size]; const uint8_t* const ip12 = &input[idx12 * 
block_size]; const uint8_t* const ip13 = &input[idx13 * block_size]; const uint8_t* const ip14 = &input[idx14 * block_size]; const uint8_t* const ip15 = &input[idx15 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); auto input2 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip2[k])); auto input3 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip3[k])); auto input4 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip4[k])); auto input5 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip5[k])); auto input6 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip6[k])); auto input7 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip7[k])); auto input8 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip8[k])); auto input9 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip9[k])); auto input10 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip10[k])); auto input11 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip11[k])); auto input12 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip12[k])); auto input13 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip13[k])); auto input14 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip14[k])); auto input15 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip15[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); output = svmla_x(svAll, output, input8, wgt8); output = svmla_x(svAll, output, input9, wgt9); output = svmla_x(svAll, output, input10, wgt10); output = svmla_x(svAll, output, input11, wgt11); output = svmla_x(svAll, output, input12, wgt12); output = svmla_x(svAll, output, input13, 
wgt13); output = svmla_x(svAll, output, input14, wgt14); output = svmla_x(svAll, output, input15, wgt15); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); auto input2 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip2[k])); auto input3 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip3[k])); auto input4 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip4[k])); auto input5 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip5[k])); auto input6 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip6[k])); auto input7 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip7[k])); auto input8 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip8[k])); auto input9 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip9[k])); auto input10 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip10[k])); auto input11 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip11[k])); auto input12 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip12[k])); auto input13 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip13[k])); auto input14 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip14[k])); auto input15 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip15[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); output = svmla_x(pg, output, input8, wgt8); output = svmla_x(pg, output, input9, wgt9); output = svmla_x(pg, output, input10, wgt10); output = svmla_x(pg, output, input11, wgt11); output = svmla_x(pg, output, input12, wgt12); output = svmla_x(pg, output, input13, wgt13); output = svmla_x(pg, output, input14, wgt14); output = svmla_x(pg, output, input15, wgt15); svst1(pg, &op[k], output); k += vLen; } j += 16; pos += 16; } // unrolling 8 
times while (j + 7 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; const auto idx4 = indices[pos + 4]; const auto idx5 = indices[pos + 5]; const auto idx6 = indices[pos + 6]; const auto idx7 = indices[pos + 7]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } if (idx4 < 0 || idx4 >= data_size) { return false; } if (idx5 < 0 || idx5 >= data_size) { return false; } if (idx6 < 0 || idx6 >= data_size) { return false; } if (idx7 < 0 || idx7 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float wgt4 = 1.f; float wgt5 = 1.f; float wgt6 = 1.f; float wgt7 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? (j + 3 - start_offset) : pos + 3]; wgt4 = weights[IS_WEIGHT_POSITIONAL ? (j + 4 - start_offset) : pos + 4]; wgt5 = weights[IS_WEIGHT_POSITIONAL ? (j + 5 - start_offset) : pos + 5]; wgt6 = weights[IS_WEIGHT_POSITIONAL ? (j + 6 - start_offset) : pos + 6]; wgt7 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 7 - start_offset) : pos + 7]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; bio += wgt2 * scale_bias[2 * idx2 + 1]; wgt2 = wgt2 * scale_bias[2 * idx2]; bio += wgt3 * scale_bias[2 * idx3 + 1]; wgt3 = wgt3 * scale_bias[2 * idx3]; bio += wgt4 * scale_bias[2 * idx4 + 1]; wgt4 = wgt4 * scale_bias[2 * idx4]; bio += wgt5 * scale_bias[2 * idx5 + 1]; wgt5 = wgt5 * scale_bias[2 * idx5]; bio += wgt6 * scale_bias[2 * idx6 + 1]; wgt6 = wgt6 * scale_bias[2 * idx6]; bio += wgt7 * scale_bias[2 * idx7 + 1]; wgt7 = wgt7 * scale_bias[2 * idx7]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; const uint8_t* const ip2 = &input[idx2 * block_size]; const uint8_t* const ip3 = &input[idx3 * block_size]; const uint8_t* const ip4 = &input[idx4 * block_size]; const uint8_t* const ip5 = &input[idx5 * block_size]; const uint8_t* const ip6 = &input[idx6 * block_size]; const uint8_t* const ip7 = &input[idx7 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); auto input2 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip2[k])); auto input3 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip3[k])); auto input4 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip4[k])); auto input5 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip5[k])); auto input6 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip6[k])); auto input7 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip7[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); output = svmla_x(svAll, output, input4, wgt4); output = svmla_x(svAll, 
output, input5, wgt5); output = svmla_x(svAll, output, input6, wgt6); output = svmla_x(svAll, output, input7, wgt7); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); auto input2 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip2[k])); auto input3 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip3[k])); auto input4 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip4[k])); auto input5 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip5[k])); auto input6 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip6[k])); auto input7 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip7[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); output = svmla_x(pg, output, input4, wgt4); output = svmla_x(pg, output, input5, wgt5); output = svmla_x(pg, output, input6, wgt6); output = svmla_x(pg, output, input7, wgt7); svst1(pg, &op[k], output); k += vLen; } j += 8; pos += 8; } // unrolling 4 times while (j + 3 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; const auto idx2 = indices[pos + 2]; const auto idx3 = indices[pos + 3]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= data_size) { return false; } if (idx2 < 0 || idx2 >= data_size) { return false; } if (idx3 < 0 || idx3 >= data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float wgt2 = 1.f; float wgt3 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; wgt2 = weights[IS_WEIGHT_POSITIONAL ? (j + 2 - start_offset) : pos + 2]; wgt3 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 3 - start_offset) : pos + 3]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; bio += wgt2 * scale_bias[2 * idx2 + 1]; wgt2 = wgt2 * scale_bias[2 * idx2]; bio += wgt3 * scale_bias[2 * idx3 + 1]; wgt3 = wgt3 * scale_bias[2 * idx3]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; const uint8_t* const ip2 = &input[idx2 * block_size]; const uint8_t* const ip3 = &input[idx3 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); auto input2 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip2[k])); auto input3 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip3[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); output = svmla_x(svAll, output, input2, wgt2); output = svmla_x(svAll, output, input3, wgt3); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); auto input2 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip2[k])); auto input3 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip3[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); output = svmla_x(pg, output, input2, wgt2); output = svmla_x(pg, output, input3, wgt3); svst1(pg, &op[k], output); k += vLen; } j += 4; pos += 4; } // unrolling 2 times while (j + 1 < end_offset) { const auto idx0 = indices[pos + 0]; const auto idx1 = indices[pos + 1]; if (idx0 < 0 || idx0 >= data_size) { return false; } if (idx1 < 0 || idx1 >= 
data_size) { return false; } float wgt0 = 1.f; float wgt1 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? (j + 0 - start_offset) : pos + 0]; wgt1 = weights[IS_WEIGHT_POSITIONAL ? (j + 1 - start_offset) : pos + 1]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; bio += wgt1 * scale_bias[2 * idx1 + 1]; wgt1 = wgt1 * scale_bias[2 * idx1]; } const uint8_t* const ip0 = &input[idx0 * block_size]; const uint8_t* const ip1 = &input[idx1 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); auto input1 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip1[k])); output = svmla_x(svAll, output, input0, wgt0); output = svmla_x(svAll, output, input1, wgt1); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); auto input1 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip1[k])); output = svmla_x(pg, output, input0, wgt0); output = svmla_x(pg, output, input1, wgt1); svst1(pg, &op[k], output); k += vLen; } j += 2; pos += 2; } // tail loop if (j < end_offset) { const auto idx0 = indices[pos + 0]; if (idx0 < 0 || idx0 >= data_size) { return false; } float wgt0 = 1.f; float bio = 0.f; if (weights) { wgt0 = weights[IS_WEIGHT_POSITIONAL ? 
(j + 0 - start_offset) : pos + 0]; } if (scale_bias) { bio += wgt0 * scale_bias[2 * idx0 + 1]; wgt0 = wgt0 * scale_bias[2 * idx0]; } const uint8_t* const ip0 = &input[idx0 * block_size]; svbool_t pg; int64_t k = 0; while (k + vLen - 1 < block_size) { auto output = svld1(svAll, &op[k]); output = svadd_x(svAll, output, bio); auto input0 = svcvt_f32_x(svAll, svld1ub_u32(svAll, &ip0[k])); output = svmla_x(svAll, output, input0, wgt0); svst1(svAll, &op[k], output); k += vLen; } if (k < block_size) { pg = svwhilelt_b32_s64(k, block_size); auto output = svld1(pg, &op[k]); output = svadd_x(pg, output, bio); auto input0 = svcvt_f32_x(pg, svld1ub_u32(pg, &ip0[k])); output = svmla_x(pg, output, input0, wgt0); svst1(pg, &op[k], output); k += vLen; } pos ++; } const int64_t length = end_offset - start_offset; if (normalize_by_lengths && length != 0) { const float len_inv = 1.0f / length; svbool_t pg; int64_t j = 0; while (j + vLen - 1 < block_size) { svst1(svAll, &op[j], svmul_x(svAll, svld1(svAll, &op[j]), len_inv)); j += vLen; } if (j < block_size) { pg = svwhilelt_b32_s64(j, block_size); svst1(pg, &op[j], svmul_x(pg, svld1(pg, &op[j]), len_inv)); } } } return pos == index_size; } bool EmbeddingLookupIdx_int64_t_uint8_t_float_false__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* out) { return EmbeddingLookupIdx_int64_t_uint8_t_float__sve( block_size, output_size, index_size, data_size, input, indices, offsets, weights, scale_bias, normalize_by_lengths, out); } bool EmbeddingLookupIdx_int64_t_uint8_t_float_true__sve( const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t* input, const int64_t* indices, const int64_t* offsets, const float* weights, const float* scale_bias, bool normalize_by_lengths, float* 
out) {
  // Positional-weight variant: the template argument must be spelled out
  // because a non-type template parameter cannot be deduced from the call
  // arguments.
  return EmbeddingLookupIdx_int64_t_uint8_t_float__sve<true>(
      block_size,
      output_size,
      index_size,
      data_size,
      input,
      indices,
      offsets,
      weights,
      scale_bias,
      normalize_by_lengths,
      out);
}

} // namespace caffe2