#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include #include #include #include #include #include #ifndef AT_PER_OPERATOR_HEADERS #include #else #include #endif #if AT_KLEIDIAI_ENABLED() #include #include #endif #if (defined(_WIN32) || defined(_WIN64)) #define RESTRICT __restrict #else #define RESTRICT __restrict__ #endif C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable") namespace at::native { namespace { inline bool is_block_start(int index, int BLOCK_SIZE) { return !(index & (BLOCK_SIZE -1)); } #if (defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)) && !defined(_MSC_VER) // convert 16x int4 to int8, handle 64 bits at a time // used in avx2 and avx512 inline __m128i conver_int4_to_int8(const uint8_t* data) { __m128i tmp = _mm_loadu_si64((const __m128i*)data); __m128i bytes = _mm_cvtepu8_epi16(tmp); const __m128i lowMask = _mm_set1_epi8(0xF); __m128i high = _mm_andnot_si128(lowMask, bytes); __m128i low = _mm_and_si128(lowMask, bytes); high = _mm_slli_epi16(high, 4); bytes = _mm_or_si128(low, high); return bytes; } #endif #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) // A block : {BLOCK_M, BLOCK_K}, lda = K // B block : {BLOCK_K, BLOCK_N / 2}, ldb = BLOCK_N / 2 // C block : {BLOCK_M, BLOCK_N}, ldc = N // // ScaleAndZeros block : {1, BLOCK_N, 2} // template inline void tinygemm_kernel( const BFloat16* RESTRICT A, const uint8_t* RESTRICT B, const BFloat16* RESTRICT ScaleAndZeros, BFloat16* RESTRICT C, int lda, int ldb, int ldc, int K, int BLOCK_K) { constexpr int ROWS = BLOCK_M; constexpr int COLS = BLOCK_N / 16; const int PREFETCH_SIZE_K = 16 * 4; const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K; // number of blocks on K const int KB = K / BLOCK_K; __m512 va; __m512 vb[COLS]; __m512 vc[ROWS * COLS]; __m512 scale[COLS]; __m512 zero[COLS]; // Lookup table to de-quantize int4 values to bf16. // Values are dequantized as truly int4 [-8, 7] range; // // dequant = (bf16(int4_value) * bf16_scale) + bf16_zero // static const __m512 lut = _mm512_set_ps( 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f, -1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f); // index for transpose static const __m512i idx1 = _mm512_set_epi32( 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); static const __m512i idx2 = _mm512_set_epi32( 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); // load scale and zero point auto load_scale_and_zeros = [&](int i, int _kb) { // load 2x bfloat16 vector __m512i t = _mm512_loadu_si512((__m512i*)(ScaleAndZeros + _kb * ldc * 2 + 32 * i)); if (_kb + PREFETCH_SIZE_KB < KB) { _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * ldc * 2 + 32 * i, _MM_HINT_T0); } // convert to 2x f32 vector __m512 a, b; vec::cvtbf16_fp32(t, a, b); // transpose scale_and_zero from {16, 2} to {2, 16} // inputs: // a: {s0, z0, s1, z1, ..., s7, z7} // b: {s8, z8, s9, z9, ..., s15, z15} // output: // scale: {s0, s1, s2, ..., s15} // zero: {z0, z1, z2, ..., z15} scale[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b); zero[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b); }; auto loadc = [&](auto i) { vc[i] = _mm512_setzero_ps(); }; c10::ForcedUnroll{}(loadc); auto compute = [&, COLS](auto i, int k) { constexpr int row = i / COLS; constexpr int col = i % COLS; if constexpr (col == 0) { float aa = static_cast(A[row * lda + k]); if (k + PREFETCH_SIZE_K < K) { _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); } va = _mm512_set1_ps(aa); } if constexpr (row == 0) { if constexpr (COLS == 4) { // when BLOCK_N = 64, handle each row at a time // to reduce de-quantize overhead. if constexpr (col == 0) { __m256i b4 = _mm256_loadu_si256((__m256i*)(B + k * ldb)); if (k + PREFETCH_SIZE_K < K) { _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0); } __m512i b32 = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(b4)); vb[0] = _mm512_permutexvar_ps(b32, lut); vb[0] = _mm512_fmadd_ps(vb[0], scale[0], zero[0]); vb[2] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut); vb[2] = _mm512_fmadd_ps(vb[2], scale[2], zero[2]); b32 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(b4, 1)); vb[1] = _mm512_permutexvar_ps(b32, lut); vb[1] = _mm512_fmadd_ps(vb[1], scale[1], zero[1]); vb[3] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut); vb[3] = _mm512_fmadd_ps(vb[3], scale[3], zero[3]); } } else { __m128i b8 = conver_int4_to_int8(B + k * ldb + col * 8); __m512i b32 = _mm512_cvtepu8_epi32(b8); vb[col] = _mm512_permutexvar_ps(b32, lut); vb[col] = _mm512_fmadd_ps(vb[col], scale[col], zero[col]); } } constexpr int idx = row * COLS + col; vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); }; for (int k = 0, kb = 0; k < K; ++k) { if (is_block_start(k, BLOCK_K)) { c10::ForcedUnroll{}(load_scale_and_zeros, kb++); } c10::ForcedUnroll{}(compute, k); } //store to C auto storec = [&, COLS](auto i) { constexpr int row = i / COLS; constexpr int col = i % COLS; if constexpr (COLS == 4) { // when BLOCK_N = 64, handle each row at a time // to reduce `cvtfp32_bf16` overhead. if constexpr (col == 0) { __m512i c01 = vec::cvtfp32_bf16(vc[row * 4 + 0], vc[row * 4 + 1]); __m512i c23 = vec::cvtfp32_bf16(vc[row * 4 + 2], vc[row * 4 + 3]); _mm512_storeu_si512((__m512i*)(C + row * ldc + 0 * 32), c01); _mm512_storeu_si512((__m512i*)(C + row * ldc + 1 * 32), c23); } } else { __m256i ci = vec::cvtfp32_bf16(vc[i]); _mm256_storeu_si256((__m256i*)(C + row * ldc + col * 16), ci); } }; c10::ForcedUnroll{}(storec); } #elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) template inline void tinygemm_kernel( const BFloat16* RESTRICT A, const uint8_t* RESTRICT B, const BFloat16* RESTRICT ScaleAndZeros, BFloat16* RESTRICT C, int lda, int ldb, int ldc, int K, int BLOCK_K) { constexpr int ROWS = BLOCK_M; constexpr int COLS = BLOCK_N / 8; const int PREFETCH_SIZE_K = 16 * 4; const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K; // number of blocks on K const int KB = K / BLOCK_K; __m256 va; __m256 vb[COLS]; __m256 vc[ROWS * COLS]; __m256 scale[COLS]; __m256 zero[COLS]; static const __m256i idx1 = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); // offset to shift from range [0, 15] to [-8, 7] const __m256 offset = _mm256_set1_ps(-8.0f); // load scale and zero point auto load_scale_and_zeros = [&](int i, int _kb) { // load 2x bfloat16 vector __m256i t = _mm256_loadu_si256((__m256i*)(ScaleAndZeros + _kb * ldc * 2 + 16 * i)); if (_kb + PREFETCH_SIZE_KB < KB) { _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * ldc * 2 + 16 * i, _MM_HINT_T0); } // convert to 2x f32 vector __m256 a, b; vec::cvtbf16_fp32(t, a, b); // transpose scale_and_zero from {8, 2} to {2, 8} // inputs: // a: {s0, z0, s1, z1, s2, z2, s3, z3} // b: {s4, z4, s5, z5, s6, z6, s7, z7} // output: // scale: {s0, s1, s2, s3, s4, s5, s6, s7} // zero: {z0, z1, z2, z3, z4, z5, z6, z7} a = _mm256_permutevar8x32_ps(a, idx1); b = _mm256_permutevar8x32_ps(b, idx1); scale[i] = _mm256_permute2f128_ps(a, b, 0b0100000); zero[i] = _mm256_permute2f128_ps(a, b, 0b0110001); // zero = -8 * scale + zero zero[i] = _mm256_fmadd_ps(scale[i], offset, zero[i]); }; auto loadc = [&](auto i) { vc[i] = _mm256_setzero_ps(); }; c10::ForcedUnroll{}(loadc); auto compute = [&, COLS](auto i, int k) { constexpr int row = i / COLS; constexpr int col = i % COLS; if constexpr (col == 0) { float aa = static_cast(A[row * lda + k]); if (k + PREFETCH_SIZE_K < K) { _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); } va = _mm256_set1_ps(aa); } if constexpr (row == 0) { if constexpr (COLS == 4) { // when BLOCK_N = 32, handle each row at a time if constexpr (col == 0) { __m256i mask = _mm256_set1_epi32(0xF); __m128i b4 = _mm_loadu_si128((__m128i*)(B + k * ldb)); if (k + PREFETCH_SIZE_K < K) { _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0); } __m256i b32 = _mm256_cvtepu8_epi32(b4); vb[0] = _mm256_cvtepi32_ps(_mm256_and_si256(b32, mask)); vb[0] = _mm256_fmadd_ps(vb[0], scale[0], zero[0]); vb[2] = _mm256_cvtepi32_ps(_mm256_srli_epi32(b32, 4)); vb[2] = _mm256_fmadd_ps(vb[2], scale[2], zero[2]); b32 = _mm256_cvtepu8_epi32(_mm_shuffle_epi32(b4, _MM_SHUFFLE(3, 2, 3, 2))); vb[1] = _mm256_cvtepi32_ps(_mm256_and_si256(b32, mask)); vb[1] = _mm256_fmadd_ps(vb[1], scale[1], zero[1]); vb[3] = _mm256_cvtepi32_ps(_mm256_srli_epi32(b32, 4)); vb[3] = _mm256_fmadd_ps(vb[3], scale[3], zero[3]); } } else { if constexpr (col % 2 == 0) { // de-quantize per 64 bits (16x int4) __m128i b8 = conver_int4_to_int8(B + k * ldb + col * 4); __m128i b8_val0 = _mm_set1_epi64x(_mm_extract_epi64(b8, 0)); __m128i b8_val1 = _mm_set1_epi64x(_mm_extract_epi64(b8, 1)); if (k + PREFETCH_SIZE_K < K) { _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb + col * 4, _MM_HINT_T0); } vb[col] = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(b8_val0)); vb[col] = _mm256_fmadd_ps(vb[col], scale[col], zero[col]); vb[col + 1] = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(b8_val1)); vb[col + 1] = _mm256_fmadd_ps(vb[col + 1], scale[col + 1], zero[col + 1]); } } } constexpr int idx = row * COLS + col; vc[idx] = _mm256_fmadd_ps(va, vb[col], vc[idx]); }; for (int k = 0, kb = 0; k < K; ++k) { if (is_block_start(k, BLOCK_K)) { c10::ForcedUnroll{}(load_scale_and_zeros, kb++); } c10::ForcedUnroll{}(compute, k); } // store to C auto storec = [&](auto i) { constexpr int row = i / COLS; constexpr int col = i % COLS; if constexpr (col % 2 == 0) { __m256i ci = vec::cvtfp32_bf16(vc[row * COLS + col], vc[row * COLS + col + 1]); _mm256_storeu_si256((__m256i*)(C + row * ldc + col * 8), ci); } }; c10::ForcedUnroll{}(storec); } #endif #if !defined(C10_MOBILE) && defined(__aarch64__) #include inline float32x4x2_t load_as_float32x4x2(const Half* ptr) { float16x4x2_t f16_val = vld2_f16(reinterpret_cast(ptr)); auto val_low = vcvt_f32_f16(f16_val.val[0]); auto val_high = vcvt_f32_f16(f16_val.val[1]); return {val_low, val_high}; } inline void store_float32x4(Half* ptr, float32x4_t val) { vst1_f16(reinterpret_cast(ptr), vcvt_f16_f32(val)); } inline float32x4x2_t load_as_float32x4x2(const BFloat16* ptr) { int32x4_t shift = vdupq_n_s32(16); uint16x4x2_t u16_val = vld2_u16(reinterpret_cast(ptr)); uint32x4_t int_low = vmovl_u16(u16_val.val[0]); uint32x4_t int_high = vmovl_u16(u16_val.val[1]); return {vreinterpretq_f32_u32(vshlq_u32(int_low, shift)), vreinterpretq_f32_u32(vshlq_u32(int_high, shift))}; } inline void store_float32x4(BFloat16* ptr, float32x4_t val) { int32x4_t shift = vdupq_n_s32(-16); uint32x4_t uint32_val = vshlq_u32(vreinterpretq_u32_f32(val), shift); vst1_u16(reinterpret_cast(ptr), vmovn_u32(uint32_val)); } inline float32x4x2_t load_as_float32x4x2(const float* ptr) { return vld2q_f32(ptr); } inline void store_float32x4(float* ptr, float32x4_t val) { vst1q_f32(ptr, val); } template inline void tinygemm_kernel_( const T* RESTRICT A, const uint8_t* RESTRICT B, const T* RESTRICT ScaleAndZeros, T* RESTRICT C, int lda, int ldb, int ldc, int K, int BLOCK_K) { int16_t shift_vals[4] = {0, -4, -8, -12}; int16x4_t shifts = vld1_s16(shift_vals); int16x4_t offs = vdup_n_s16(8); uint16x4_t mask = vdup_n_u16(0x0F); for (const auto m : c10::irange(BLOCK_M)) { for (int n = 0; n < BLOCK_N; n+= 16) { float32x4_t c_val[4]; float32x4_t scales[4], zeros[4]; c10::ForcedUnroll<4>{}([&](auto i) { c_val[i] = vdupq_n_f32(0.0); }); for (const auto k : c10::irange(K)) { const auto a_val = vdupq_n_f32(static_cast(A[m * lda + k])); if (is_block_start(k, BLOCK_K)) { int kb = k / BLOCK_K; c10::ForcedUnroll<4>{}([&](auto i) { auto scales_and_zeros = load_as_float32x4x2(ScaleAndZeros + kb * ldc * 2 + n * 2 + i * 8); scales[i] = scales_and_zeros.val[0]; zeros[i] = scales_and_zeros.val[1]; }); } c10::ForcedUnroll<4>{}([&](auto i) { uint16_t b_pack = reinterpret_cast(B + k * ldb + n / 2)[i]; uint16x4_t b_masked = vand_u16(vshl_u16(vdup_n_u16(b_pack), shifts), mask); int16x4_t b_ints = vsub_s16(vreinterpret_s16_u16(b_masked), offs); float32x4_t b_vals = vcvtq_f32_s32(vmovl_s16(b_ints)); b_vals = vaddq_f32(zeros[i], vmulq_f32(scales[i], b_vals)); c_val[i] = vfmaq_f32(c_val[i], b_vals, a_val); }); } c10::ForcedUnroll<4>{}([&](auto i) { store_float32x4(C + m * ldc + n + i * 4, c_val[i]); }); } } } template inline void tinygemm_kernel( const Half* RESTRICT A, const uint8_t* RESTRICT B, const Half* RESTRICT ScaleAndZeros, Half* RESTRICT C, int lda, int ldb, int ldc, int K, int BLOCK_K) { tinygemm_kernel_(A, B, ScaleAndZeros, C, lda, ldb, ldc, K, BLOCK_K); } template inline void tinygemm_kernel( const BFloat16* RESTRICT A, const uint8_t* RESTRICT B, const BFloat16* RESTRICT ScaleAndZeros, BFloat16* RESTRICT C, int lda, int ldb, int ldc, int K, int BLOCK_K) { tinygemm_kernel_(A, B, ScaleAndZeros, C, lda, ldb, ldc, K, BLOCK_K); } template inline void tinygemm_kernel( const float* RESTRICT A, const uint8_t* RESTRICT B, const float* RESTRICT ScaleAndZeros, float* RESTRICT C, int lda, int ldb, int ldc, int K, int BLOCK_K) { tinygemm_kernel_(A, B, ScaleAndZeros, C, lda, ldb, ldc, K, BLOCK_K); } #endif template inline float convert_int4_to_float(const uint8_t* b, int n) { static constexpr float lut[16] = { -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f }; int index; #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) if constexpr (BLOCK_N == 64) { const int nb = n/BLOCK_N; n -= nb*BLOCK_N; if (n < 32) { auto val = b[nb * BLOCK_N / 2 + n]; index = val & 0x0f; } else { auto val = b[nb * BLOCK_N / 2 + (n - 32)]; index = val >> 4; } } else #elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) if constexpr (BLOCK_N == 32) { const int nb = n/BLOCK_N; n -= nb*BLOCK_N; if (n < 16) { auto val = b[nb * BLOCK_N / 2 + n]; index = val & 0x0f; } else { auto val = b[nb * BLOCK_N / 2 + (n - 16)]; index = val >> 4; } } else #endif { const auto is_even = (n & 1) == 0; auto val = b[n/2]; index = is_even ? (val & 0x0F) : (val >> 4); } return lut[index]; } // non-vectorized version template inline void tinygemm_kernel( const T* RESTRICT A, const uint8_t* RESTRICT B, const T* RESTRICT ScaleAndZeros, T* RESTRICT C, int lda, int ldb, int ldc, int K, int BLOCK_K) { for (const auto m : c10::irange(BLOCK_M)) { for (const auto n : c10::irange(BLOCK_N)) { float c_val = 0; for (const auto k : c10::irange(K)) { int kb = k / BLOCK_K; const auto scale = static_cast(ScaleAndZeros[kb * ldc * 2 + n * 2]); const auto zero = static_cast(ScaleAndZeros[kb * ldc * 2 + n * 2 + 1]); const auto a_val = static_cast(A[m * lda + k]); float b_val = convert_int4_to_float(B + k *ldb, n); b_val = b_val * scale + zero; c_val += a_val * b_val; } C[m * ldc + n] = c_val; } } } #define LAUNCH_TINYGEMM_KERNEL(MB_SIZE, NB_SIZE) \ tinygemm_kernel( \ A_ptr, B_ptr, S_ptr, C_ptr, \ K, NB_SIZE / 2, N, K, BLOCK_K); #define LAUNCH_TINYGEMM_NB_SIZE(MB_SIZE) \ switch (nb_size) { \ case 16: \ LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 16); \ break; \ case 32: \ LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 32); \ break; \ case 48: \ LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 48); \ break; \ case 64: \ LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 64); \ break; \ default: \ TORCH_CHECK(false, "Unsupported n block size: ", nb_size); \ break; \ } // NB: int4 weight pack (with BLOCK_N 64) // weight (int32): {N/64, 64, K} // packed (uint8): {N/64, K, 32} // // 1. avx512 packed format: // When N is 64, to do 256-bit unpacking at a time, we pack Lane0 with Lane2, // Lane1 with Lane3 since we can only do shift on a 128-bit basis. // // weight: // [Lane0] N0...15: {a00, a01, a02, ...} // [Lane1] N16...31: {a10, a11, a12, ...} // [Lane2] N32...47: {a20, a21, a22, ...} // [Lane3] N48...63: {a30, a31, a32, ...} // // packed: // [Lane02] N0...31: {a20|a00, a21|a01, a22|a02, ...} // [Lane13] N32...63: {a30|a10, a31|a11, a32|a12, ...} // // Note: when N is 16, 32 or 48, pack with 64-bit format. // // 2. avx2 packed format: // When N is 32, to do 128-bit unpacking at a time. // // weight: // [Lane0] N0...15: { a0, a1, a2, ...} // [Lane1] N16...32: {a16, a17, a18, ...} // // packed: // [Lane01] N0...32: {a16|a0, a17|a1, a18|a2, ...} // // Note: When N is 16, pack with 64-bit format // // 3 non-vectorized packed format: // Do 64-bit unpacking at a time. // // weight: {a0, a1, a2, a3, ..., a14, a15} // packed: {a1|a0, a3, a2, ..., a15|a14} // void weight_to_int4pack_kernel( const Tensor& weight_packed, const Tensor& weight) { auto weight_packed_data = reinterpret_cast(weight_packed.data_ptr()); const auto weight_data = weight.data_ptr(); int N = weight.size(0); int K = weight.size(1); // 64 for avx512 and 32 for avx2/non-vectorized constexpr int BLOCK_N = vec::Vectorized::size() * 4; const int NB = (N + BLOCK_N - 1) / BLOCK_N; // parallel on NB blocks at::parallel_for(0, NB, 0, [&](int begin, int end) { for (const auto i : c10::irange(begin, end)) { int nb_size = std::min(BLOCK_N, N - i * BLOCK_N); const int32_t* src = weight_data + i * BLOCK_N * K; uint8_t* dst = weight_packed_data + i * K * BLOCK_N / 2; for (const auto k : c10::irange(K)) { #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) if (nb_size == BLOCK_N) { for (const auto d : c10::irange(16)) { int32_t val0 = src[(d + 0) * K + k]; int32_t val1 = src[(d + 16) * K + k]; int32_t val2 = src[(d + 32) * K + k]; int32_t val3 = src[(d + 48) * K + k]; uint8_t packed02 = (((uint8_t)(val2) << 4)) | ((uint8_t)(val0)); uint8_t packed13 = (((uint8_t)(val3) << 4)) | ((uint8_t)(val1)); dst[k * 32 + d] = packed02; dst[k * 32 + 16 + d] = packed13; } } else { // for nb_size 16, 32, 48 for (int n = 0; n < nb_size; n += 2) { int32_t val0 = src[n * K + k]; int32_t val1 = src[n * K + K + k]; uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); dst[k * nb_size / 2 + n / 2] = packed; } } #elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) if (nb_size == BLOCK_N) { // for nb_size 32 for (const auto d : c10::irange(16)) { int32_t val0 = src[(d + 0) * K + k]; int32_t val1 = src[(d + 16) * K + k]; uint8_t packed01 = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); dst[k * 16 + d] = packed01; } } else { // for nb_size 16 for (int n = 0; n < nb_size; n += 2) { int32_t val0 = src[n * K + k]; int32_t val1 = src[n * K + K + k]; uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); dst[k * nb_size / 2 + n / 2] = packed; } } #else for (int n = 0; n < nb_size; n += 2) { int32_t val0 = src[n * K + k]; int32_t val1 = src[n * K + K + k]; uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); dst[k * nb_size / 2 + n / 2] = packed; } #endif } } }); } template void int4pack_mm_kernel_( const Tensor& C, const Tensor& A, const Tensor& B, int qGroupSize, const Tensor& qScaleAndZeros) { const auto* A_data = A.const_data_ptr(); const auto* B_data = reinterpret_cast(B.const_data_ptr()); auto* C_data = C.data_ptr(); const auto* S_data = qScaleAndZeros.const_data_ptr(); int M = A.size(0); int N = B.size(0); int K = A.size(1); constexpr int BLOCK_M = 4; // 64 for avx512 and 32 for avx2/non-vectorized constexpr int BLOCK_N = vec::Vectorized::size() * 4; // 32, 64, 128, 256 const int BLOCK_K = qGroupSize; const int MB = (M + BLOCK_M - 1) / BLOCK_M; const int NB = (N + BLOCK_N - 1) / BLOCK_N; at::parallel_for(0, MB * NB, 0, [&](int begin, int end) { int mb{0}, nb{0}; data_index_init(begin, mb, MB, nb, NB); for ([[maybe_unused]] const auto i : c10::irange(begin, end)) { int mb_start = mb * BLOCK_M; int mb_size = std::min(BLOCK_M, M - mb_start); int nb_start = nb * BLOCK_N; int nb_size = std::min(BLOCK_N, N - nb_start); const auto* A_ptr = A_data + mb_start * K; const auto* B_ptr = B_data + nb_start * K / 2; const auto* S_ptr = S_data + nb_start * 2; auto* C_ptr = C_data + mb_start * N + nb_start; switch (mb_size) { case 1: LAUNCH_TINYGEMM_NB_SIZE(1); break; case 2: LAUNCH_TINYGEMM_NB_SIZE(2); break; case 3: LAUNCH_TINYGEMM_NB_SIZE(3); break; case 4: LAUNCH_TINYGEMM_NB_SIZE(4); break; default: TORCH_CHECK(false, "Unsupported m block size: ", mb_size); } // move to the next index data_index_step(mb, MB, nb, NB); } }); } void int4pack_mm_kernel( const Tensor& C, const Tensor& A, const Tensor& B, int qGroupSize, const Tensor& qScaleAndZeros) { if (C.scalar_type() == kBFloat16) { int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros); } else if (C.scalar_type() == kHalf) { int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros); } else { int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros); } } #if AT_KLEIDIAI_ENABLED() bool can_use_kleidiai( const at::Tensor& scales_zeros, const int64_t K, const int64_t block_size) { bool ret = false; if (cpuinfo_has_arm_neon_dot()) { // The Groupwise kernel requires BFloat16 Scales and Channelwise kernel // requires Float32 Scales. If not provided, we will use fallback // implementation. if ((block_size == K && scales_zeros.dtype() == at::kFloat) || ((block_size < K && !(block_size % 32) && !(K % block_size)) && scales_zeros.dtype() == at::kBFloat16)) { ret = true; } } return ret; } #endif /** * The Int4 quantized weights must be represented as a uint8 tensor * For matrix multiplication with a weight shape of (N x K) * the shape of the 4-bit quantized weights is [N, K/groupsize, groupsize/2]. * * For KleidiAI weight packing, the scales, biases, and Int4 quantized * weights are packed into a single `packed_weights` structure, optimized for * Arm instructions. * * In the fallback reference kernel, no special packing is required for * Int4 quantized weights. * * The Groupwise kernel requires BFloat16 Scales and Channelwise kernel requires * Float32 Scales. If not provided, we will use fallback implementation. */ void dyn_quant_pack_4bit_weight_kernel( Tensor& packed_weights, const Tensor& weights, const Tensor& scales_zeros, const std::optional& bias, const int64_t N, const int64_t K, const int64_t block_size) { #if AT_KLEIDIAI_ENABLED() if (can_use_kleidiai(scales_zeros, K, block_size)) { const int64_t weight_packed_size = kleidiai::kai_pack_rhs_int4_size(N, K, block_size); packed_weights.resize_({weight_packed_size}); kleidiai::kai_pack_int4_rhs( packed_weights, weights, scales_zeros, bias, N, K, block_size); } else #endif { TORCH_CHECK( bias.has_value() == 0, __func__, " : Bias is unsupported in reference implementation"); packed_weights = packed_weights.to(kFloat); auto weight_reshaped = weights.view({-1}).to(kFloat); auto scales_zeros_reshaped = scales_zeros.view({-1}).to(kFloat); auto res = at::cat({weight_reshaped, scales_zeros_reshaped}, 0); packed_weights.resize_(res.sizes()).copy_(res); } } static void ref_dyn_quant_matmul_4bit_channelwise_kernel( size_t m, size_t n, size_t k, const float* lhs_f32, const uint8_t* rhs_qs4cx, const float* rhs_scales_f32, float* dst_f32, float scalar_min, float scalar_max) { const size_t input_size_8bit = m * (k + sizeof(int32_t) + sizeof(float)); auto lhs_qa8dx_buffer = std::make_unique(input_size_8bit); uint8_t* lhs_qa8dx = lhs_qa8dx_buffer.get(); // Lambda for quantizing the fp32 input to 8 bit symmetric and pack it in // required format for matmul auto input_quant_pack_8bit_channelwise = [&](size_t m, size_t k, const float* lhs_f32, int8_t* lhs_qa8dx) { const size_t dst_stride = (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t)); const size_t lhs_qa8dx_stride = k; for (size_t m_idx = 0; m_idx < m; ++m_idx) { const float* src_ptr = lhs_f32 + m_idx * lhs_qa8dx_stride; float max0 = -FLT_MAX; float min0 = FLT_MAX; // Find min/max for each channel for (size_t k_idx = 0; k_idx < k; ++k_idx) { const float src0_0 = src_ptr[k_idx]; max0 = (std::max)(src0_0, max0); min0 = (std::min)(src0_0, min0); } // Maximum/minimum int8 values const float qmin = (float)INT8_MIN; const float qmax = (float)INT8_MAX; const float rmin0 = (std::min)(0.0f, min0); const float rmax0 = (std::max)(0.0f, max0); const float scale0 = rmin0 == rmax0 ? 1.f : (qmax - qmin) / (rmax0 - rmin0); // Reciprocal to quantize const float recip_scale0 = scale0 ? 1.0f / scale0 : 0.0f; const float descaled_min0 = rmin0 * scale0; const float descaled_max0 = rmax0 * scale0; const float zero_point_from_min_error0 = qmin + descaled_min0; const float zero_point_from_max_error0 = qmax + descaled_max0; float zero_point0 = zero_point_from_min_error0 + zero_point_from_max_error0 > 0 ? qmin - descaled_min0 : qmax - descaled_max0; zero_point0 = (std::max)(zero_point0, qmin); zero_point0 = (std::min)(zero_point0, qmax); // Round to nearest integer const int32_t nudged_zero_point0 = lrintf(zero_point0); int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride; // LHS offset at the beginning of the row *((float*)(dst_ptr)) = recip_scale0; dst_ptr += sizeof(float); *((int32_t*)(dst_ptr)) = -nudged_zero_point0; dst_ptr += sizeof(int32_t); // Quantize the channels for (size_t k_idx = 0; k_idx < k; ++k_idx) { const float src0_0 = src_ptr[k_idx]; // Scale the values int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0)); v0_s32 = v0_s32 + nudged_zero_point0; v0_s32 = (std::max)(v0_s32, static_cast(INT8_MIN)); v0_s32 = (std::min)(v0_s32, static_cast(INT8_MAX)); dst_ptr[0] = (int8_t)v0_s32; dst_ptr += sizeof(int8_t); } } }; // Dynamically Quantize the float32 input to 8 bit assymetric input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx); const size_t lhs_stride = k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t); const size_t rhs_qs4cx_stride = ((((k + 2 - 1) / 2) * 2) / 2); for (size_t m_idx = 0; m_idx < m; ++m_idx) { const int8_t* lhs_ptr_start = (int8_t*)lhs_qa8dx + m_idx * lhs_stride; for (size_t n_idx = 0; n_idx < n; ++n_idx) { // Main f32 accumulator int32_t iacc = 0; const int8_t* lhs_ptr = lhs_ptr_start; const uint8_t* rhs_ptr = rhs_qs4cx + n_idx * rhs_qs4cx_stride; // Get the LHS quantization parameters stored at the // beginning of each row const float lhs_scale = *(const float*)lhs_ptr; lhs_ptr += sizeof(float); const int32_t lhs_offset = *(const int32_t*)lhs_ptr; lhs_ptr += sizeof(int32_t); for (size_t k_idx = 0; k_idx < k; ++k_idx) { // Get the LHS values const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; // Get the RHS values const uint8_t rhs_byte = rhs_ptr[0]; // Unpack the RHS values int32_t rhs_v0 = 0; if ((k_idx % 2) == 0) { rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8); } else { rhs_v0 = (((int32_t)(rhs_byte >> 4)) - 8); } iacc += lhs_v0 * rhs_v0; iacc += lhs_offset * rhs_v0; lhs_ptr += 1; // Increment only when k_idx is not a multiple of 2 rhs_ptr += k_idx % 2; } // Get the RHS scale const float rhs_scale = rhs_scales_f32[n_idx]; float main_acc = iacc * rhs_scale; main_acc = main_acc * lhs_scale; // Clamp (min-max) operation main_acc = (std::max)(main_acc, scalar_min); main_acc = (std::min)(main_acc, scalar_max); dst_f32[0] = main_acc; dst_f32 += 1; } } } static void ref_dyn_quant_matmul_4bit_groupwise_kernel( size_t m, size_t n, size_t k, size_t bl, const float* lhs_f32, const uint8_t* rhs_qs4c32, const float* rhs_scales_fp32, float* dst_f32, float scalar_min, float scalar_max) { // Lambda for LHS quantization auto lhs_quant_pack = [&](size_t m, size_t k, const float* lhs_f32, int8_t* lhs_qa8dx) { const size_t dst_stride = (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t)); for (size_t row_idx = 0; row_idx < m; ++row_idx) { const float* src_ptr = lhs_f32 + row_idx * k; float max0 = -FLT_MAX; float min0 = FLT_MAX; for (size_t k_idx = 0; k_idx < k; ++k_idx) { const float src0_0 = src_ptr[k_idx]; max0 = (std::max)(src0_0, max0); min0 = (std::min)(src0_0, min0); } const float qmin = (float)INT8_MIN; const float qmax = (float)INT8_MAX; const float rmin0 = (std::min)(0.0f, min0); const float rmax0 = (std::max)(0.0f, max0); const float scale0 = (rmin0 == rmax0) ? 1.f : (qmax - qmin) / (rmax0 - rmin0); const float recip_scale0 = scale0 ? 1.0f / scale0 : 0.0f; const float descaled_min0 = rmin0 * scale0; const float descaled_max0 = rmax0 * scale0; float zero_point0 = (qmin + descaled_min0 + qmax + descaled_max0 > 0) ? qmin - descaled_min0 : qmax - descaled_max0; zero_point0 = (std::max)(zero_point0, qmin); zero_point0 = (std::min)(zero_point0, qmax); const int32_t nudged_zero_point0 = lrintf(zero_point0); int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride; *((float*)(dst_ptr)) = recip_scale0; dst_ptr += sizeof(float); *((int32_t*)(dst_ptr)) = -nudged_zero_point0; dst_ptr += sizeof(int32_t); for (size_t k_idx = 0; k_idx < k; ++k_idx) { const float src0_0 = src_ptr[k_idx]; int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0)); v0_s32 = (std::max)( (std::min)( v0_s32 + nudged_zero_point0, static_cast(INT8_MAX)), static_cast(INT8_MIN)); dst_ptr[0] = (int8_t)v0_s32; dst_ptr += sizeof(int8_t); } } }; auto lhs_qa8dx_buffer = std::make_unique( m * (k + sizeof(float) + sizeof(int32_t))); // Allocate for LHS int8_t* lhs_qa8dx = lhs_qa8dx_buffer.get(); // Quantize and pack LHS lhs_quant_pack(m, k, lhs_f32, lhs_qa8dx); const size_t num_blocks_row = (((k + bl - 1) / bl) * bl) / bl; const size_t lhs_stride = k + sizeof(float) + sizeof(int32_t); const size_t rhs_stride = (((k + 2 - 1) / 2) * 2) / 2; for (size_t row_idx = 0; row_idx < m; ++row_idx) { const int8_t* lhs_ptr_start = lhs_qa8dx + row_idx * lhs_stride; for (size_t col_idx = 0; col_idx < n; ++col_idx) { float main_acc = 0.0f; const int8_t* lhs_ptr = lhs_ptr_start; const uint8_t* rhs_ptr = rhs_qs4c32 + col_idx * rhs_stride; const float lhs_scale = *(const float*)lhs_ptr; lhs_ptr += sizeof(float); const int32_t lhs_offset = *(const int32_t*)lhs_ptr; lhs_ptr += sizeof(int32_t); for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { const float rhs_scale = rhs_scales_fp32[block_idx + col_idx * num_blocks_row]; int32_t iacc = 0; for (size_t i = 0; i < bl; ++i) { const size_t k_idx = block_idx * bl + i; if (k_idx >= k) { break; } const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; const uint8_t rhs_byte = rhs_ptr[0]; int32_t rhs_v0 = (k_idx % 2 == 0) ? (((int32_t)(rhs_byte & 0x0F)) - 8) : (((int32_t)(rhs_byte >> 4)) - 8); iacc += lhs_v0 * rhs_v0; iacc += lhs_offset * rhs_v0; lhs_ptr += 1; rhs_ptr += (k_idx % 2); } main_acc += iacc * rhs_scale; } main_acc = main_acc * lhs_scale; main_acc = (std::max)(main_acc, scalar_min); main_acc = (std::min)(main_acc, scalar_max); dst_f32[0] = main_acc; dst_f32 += 1; } } } /** * Dynamic Input Quant 4 bit weights matmul execution flow (INT4 Weights + FP scales + FP32 Bias) FP32 Input Packed Buffer | | Quantize Cast to INT8 to INT8 | | v v INT8 Input INT8 Weights \ / \ / \ / INT8 Matrix Multiplication | v FP32 Dequantized and Accumulate in FP32 | v FP32 Final Output * The Groupwise kernel requires BFloat16 Scales and Channelwise kernel requires * Float32 Scales. If not provided, we will use fallback implementation. */ void dyn_quant_matmul_4bit_kernel( const Tensor& output, const Tensor& inp, const Tensor& packed_weights, const int64_t M, const int64_t N, const int64_t K, const int64_t block_size) { #if AT_KLEIDIAI_ENABLED() const int64_t weight_packed_size = kleidiai::kai_pack_rhs_int4_size(N, K, block_size); if (weight_packed_size == packed_weights.numel()) { // KleidiAI interface intenally handles the Channelwise and groupwise // distinction kleidiai::kai_quant_pack_lhs_int4_mm( output, inp, packed_weights, M, N, K, block_size); } else #endif { float* lhs_f32 = reinterpret_cast(inp.data_ptr()); const auto weights_size = N * K / 2; // The weights needs to be in uint8_t data type after quantization auto extracted_weights = (packed_weights.narrow(0, 0, weights_size)).to(kByte); auto float32_scales = (packed_weights.narrow( 0, weights_size, packed_weights.size(0) - weights_size)) .to(kFloat); uint8_t* rhs_4bit = reinterpret_cast(extracted_weights.data_ptr()); float* rhs_scales_f32 = reinterpret_cast(float32_scales.data_ptr()); float* dst_f32 = reinterpret_cast(output.data_ptr()); if (block_size == K) { ref_dyn_quant_matmul_4bit_channelwise_kernel( M, N, K, lhs_f32, rhs_4bit, rhs_scales_f32, dst_f32, -FLT_MAX, FLT_MAX); } else if (!(block_size % 32) && !(K % block_size)) { ref_dyn_quant_matmul_4bit_groupwise_kernel( M, N, K, block_size, lhs_f32, rhs_4bit, rhs_scales_f32, dst_f32, -FLT_MAX, FLT_MAX); } else { TORCH_CHECK( block_size == K || (!(block_size % 32) && !(K % block_size)), __func__, ": Group size should be multiple 32 or in_features [", K, "]. Provided ", block_size); } } } } // anonymous namespace ALSO_REGISTER_AVX512_DISPATCH(weight_to_int4pack_stub, &weight_to_int4pack_kernel) ALSO_REGISTER_AVX512_DISPATCH(int4pack_mm_stub, &int4pack_mm_kernel) REGISTER_DISPATCH(dyn_quant_pack_4bit_weight_stub, &dyn_quant_pack_4bit_weight_kernel) REGISTER_DISPATCH(dyn_quant_matmul_4bit_stub, &dyn_quant_matmul_4bit_kernel) } // at::native C10_DIAGNOSTIC_POP()