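# Commit note (appears to be repository-page residue rather than module content):
# - 4-dimension fingerprint collection: performance / language / capability / behavior
# - models.py: added IdentityFingerprintModel (the 5th-dimension data model)
# - comparator.py: upgraded to 5-dimension scoring (including identity-dimension comparison)
# - reporter.py: added identity-verification report output
# - main.py: integrated the identity collection flow
# - identity collector: complete code to be added in the next commit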
66 lines
2.0 KiB
Python
66 lines
2.0 KiB
Python
"""Lightweight token estimator using regex tokenization + CJK character handling."""
|
|
|
|
import re
|
|
|
|
# Regex pattern for tokenization.
#
# Compiled in verbose mode, so whitespace and `#` comments inside the pattern
# are ignored. Alternatives are tried left to right: each CJK/kana character
# is matched on its own, then English words, then numbers, then any single
# character that is neither whitespace nor a word character.
#
# The CJK ranges listed here are the same five ranges that `_is_cjk` accepts,
# so every character `_is_cjk` recognises is also produced as a token.
_WORD_PATTERN = re.compile(r"""
    [\u4e00-\u9fff]|              # CJK Unified Ideographs (Chinese)
    [\u3400-\u4dbf]|              # CJK Unified Ideographs Extension A
    [\u3040-\u309f]|              # Hiragana
    [\u30a0-\u30ff]|              # Katakana
    [\uf900-\ufaff]|              # CJK Compatibility Ideographs
    [a-zA-Z]+(?:'[a-zA-Z]+)*|     # English words (including contractions)
    \d+(?:\.\d+)?|                # Numbers (including decimals)
    [^\s\w]                       # Punctuation
""", re.VERBOSE | re.UNICODE)
|
|
|
|
|
|
def estimate_tokens(text: str) -> int:
    """
    Estimate the number of tokens in a text string.

    The text is split with ``_WORD_PATTERN`` and every piece is weighted:

    * a single CJK character (as decided by ``_is_cjk``) counts as 1.5;
    * a piece starting with an ASCII letter counts as 1, or as
      ``len(piece) / 4`` when it is longer than 6 characters;
    * any other piece the pattern yields counts as 1.

    Args:
        text: The text to measure.

    Returns:
        0 when ``text`` is empty (falsy). Otherwise the weighted sum
        truncated to an int, never less than 1 -- so non-empty text in
        which the pattern finds nothing (e.g. only whitespace) yields 1.
    """
    if not text:
        return 0

    count = 0.0
    for token in _WORD_PATTERN.findall(text):
        if len(token) == 1 and _is_cjk(token):
            # CJK characters are roughly 1.5 tokens each
            count += 1.5
        elif re.match(r"[a-zA-Z]", token):
            # Long English words may be multiple tokens. For len > 6 the
            # quotient is already >= 1.75, so no lower clamp is needed.
            count += len(token) / 4 if len(token) > 6 else 1
        else:
            count += 1

    # int() truncates the fractional part; the floor of 1 applies to any
    # non-empty input.
    return max(1, int(count))
|
|
|
|
|
|
def _is_cjk(char: str) -> bool:
    """Check if a character is a CJK character.

    Args:
        char: A string of exactly one character; it is passed to ``ord``,
            which raises ``TypeError`` for strings of any other length.

    Returns:
        True when the code point lies in one of the five ranges tested
        below, False otherwise.
    """
    cp = ord(char)
    return (
        0x4E00 <= cp <= 0x9FFF       # CJK Unified Ideographs
        or 0x3040 <= cp <= 0x309F    # Hiragana
        or 0x30A0 <= cp <= 0x30FF    # Katakana
        or 0xF900 <= cp <= 0xFAFF    # CJK Compatibility
        or 0x3400 <= cp <= 0x4DBF    # CJK Extension A
    )
|
|
|
|
|
|
def count_cjk_chars(text: str) -> int:
    """Count the number of CJK characters in text.

    Each character of ``text`` is tested individually with ``_is_cjk``.

    Args:
        text: The text to scan.

    Returns:
        How many characters ``_is_cjk`` accepts; 0 for an empty string.
    """
    return sum(1 for ch in text if _is_cjk(ch))
|
|
|
|
|
|
def count_words(text: str) -> int:
    """Count words (non-CJK) in text.

    A word is a run of ASCII letters, optionally followed by one or more
    apostrophe-joined runs of ASCII letters (e.g. ``don't``). Digits and
    all other characters separate words and are not counted themselves.

    Args:
        text: The text to scan.

    Returns:
        The number of non-overlapping matches; 0 for an empty string.
    """
    return len(re.findall(r"[a-zA-Z]+(?:'[a-zA-Z]+)*", text))
|