Files
ai-xn-check/utils/tokenizer.py
nosqli cdcd69256b feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为
- models.py 已加入 IdentityFingerprintModel (第5维数据模型)
- comparator.py 已升级为5维评分 (含identity维度比较)
- reporter.py 已加入身份验证报告输出
- main.py 已集成identity采集流程
- identity collector 待下次提交补充完整代码
2026-03-09 00:15:03 +08:00

66 lines
2.0 KiB
Python

"""Lightweight token estimator using regex tokenization + CJK character handling."""
import re
# Regex pattern for tokenization
_WORD_PATTERN = re.compile(r"""
[\u4e00-\u9fff]| # CJK Unified Ideographs (Chinese)
[\u3040-\u309f]| # Hiragana
[\u30a0-\u30ff]| # Katakana
[\uf900-\ufaff]| # CJK Compatibility Ideographs
[a-zA-Z]+(?:'[a-zA-Z]+)*| # English words (including contractions)
\d+(?:\.\d+)?| # Numbers (including decimals)
[^\s\w] # Punctuation
""", re.VERBOSE | re.UNICODE)
def estimate_tokens(text: str) -> int:
    """Estimate the number of tokens in *text*.

    Splits the string with the module-level tokenizer regex, then weights
    each piece: a lone CJK character counts as 1.5 tokens, an ASCII word
    longer than six letters counts as roughly one token per four letters,
    and everything else counts as one token. Always returns at least 1 for
    non-empty input.
    """
    if not text:
        return 0

    weight = 0.0
    for piece in _WORD_PATTERN.findall(text):
        if len(piece) == 1 and _is_cjk(piece):
            # CJK ideographs average ~1.5 tokens each in typical tokenizers.
            weight += 1.5
        elif piece[0].isascii() and piece[0].isalpha() and len(piece) > 6:
            # Long English words tend to be split into ~4-character subwords.
            weight += max(1, len(piece) / 4)
        else:
            weight += 1
    return max(1, int(weight))
def _is_cjk(char: str) -> bool:
"""Check if a character is a CJK character."""
cp = ord(char)
return (
(0x4E00 <= cp <= 0x9FFF) or # CJK Unified Ideographs
(0x3040 <= cp <= 0x309F) or # Hiragana
(0x30A0 <= cp <= 0x30FF) or # Katakana
(0xF900 <= cp <= 0xFAFF) or # CJK Compatibility
(0x3400 <= cp <= 0x4DBF) # CJK Extension A
)
def count_cjk_chars(text: str) -> int:
    """Return how many characters of *text* are CJK (per _is_cjk)."""
    # Booleans sum as 0/1, so this counts the True results.
    return sum(map(_is_cjk, text))
def count_words(text: str) -> int:
    """Return the number of ASCII-letter word runs in *text*.

    A contraction like "don't" counts as a single word; CJK characters
    and digits are not counted.
    """
    word_re = re.compile(r'[a-zA-Z]+(?:\'[a-zA-Z]+)*')
    return sum(1 for _ in word_re.finditer(text))