package com.gzzm.lobster.common; /** * 粗粒度 token 估算 / Coarse-grained token estimator. * *

 * <p>大龙虾在政务外网运行，不允许依赖公网 tokenizer 服务，
 * 因此按「中文字符 ≈ 1 token，英文单词 ≈ 0.75 token，其他字符按 0.5 折算」
 * 做一个稳定、保守的估算供预算治理使用。
 *

/**
 * Coarse-grained token estimator.
 *
 * <p>The intranet deployment forbids calling public tokenizers.
 * This estimator uses a heuristic intended for budget gating:
 * one CJK character counts as 1 token, one run of ASCII letters/digits
 * (an "English word") counts as 0.75 token, and every other non-whitespace
 * character counts as 0.5 token. Whitespace is never counted.
 *
 * <p>The class is a stateless utility and cannot be instantiated.
 */
public final class TokenEstimator {

    /**
     * Tokens charged per run of consecutive ASCII letters/digits.
     *
     * <p>NOTE(review): the commonly quoted rule of thumb is roughly 0.75 words per
     * token (about 1.33 tokens per word). Confirm that 0.75 tokens per word is the
     * intended weight before treating the estimate as conservative for English text.
     */
    private static final double TOKENS_PER_WORD = 0.75;

    /** Tokens charged per character that is neither CJK, ASCII alphanumeric, nor whitespace. */
    private static final double TOKENS_PER_OTHER_CHAR = 0.5;

    private TokenEstimator() {}

    /**
     * Estimates the token count of the given text.
     *
     * <p>The text is scanned one UTF-16 {@code char} at a time. A supplementary-plane
     * character therefore arrives as two surrogate chars, each of which lands in the
     * "other" bucket (2 x 0.5 = 1 token per such character).
     *
     * @param text the text to measure; may be {@code null}
     * @return the estimated token count, rounded up; {@code 0} for {@code null} or empty text
     */
    public static int estimate(String text) {
        if (text == null || text.isEmpty()) {
            return 0;
        }

        int cjkChars = 0;
        int wordRuns = 0;
        int otherChars = 0;
        boolean insideWord = false;

        for (int i = 0, n = text.length(); i < n; i++) {
            char c = text.charAt(i);

            if (isAsciiAlphanumeric(c)) {
                // Only the first character of a run opens a new word.
                if (!insideWord) {
                    wordRuns++;
                    insideWord = true;
                }
                continue;
            }

            // Anything that is not an ASCII letter/digit terminates the current word.
            insideWord = false;
            if (isCjk(c)) {
                cjkChars++;
            } else if (!Character.isWhitespace(c)) {
                // Whitespace is not charged; every other character is.
                otherChars++;
            }
        }

        return (int) Math.ceil(
                cjkChars + wordRuns * TOKENS_PER_WORD + otherChars * TOKENS_PER_OTHER_CHAR);
    }

    /** Returns whether {@code c} is one of a-z, A-Z or 0-9. */
    private static boolean isAsciiAlphanumeric(char c) {
        return (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9');
    }

    /**
     * Returns whether {@code c} lies in one of the BMP ranges that are charged as a
     * full token: CJK Unified Ideographs, Extension A, CJK Compatibility Ideographs,
     * Hiragana/Katakana, and Hangul Syllables.
     */
    private static boolean isCjk(char c) {
        return (c >= 0x4E00 && c <= 0x9FFF)     // CJK Unified Ideographs
                || (c >= 0x3400 && c <= 0x4DBF) // CJK Unified Ideographs Extension A
                || (c >= 0xF900 && c <= 0xFAFF) // CJK Compatibility Ideographs
                || (c >= 0x3040 && c <= 0x30FF) // Hiragana and Katakana
                || (c >= 0xAC00 && c <= 0xD7AF); // Hangul Syllables (the "K" in CJK)
    }
}