"""Lightweight token estimator using regex tokenization + CJK character handling.""" import re # Regex pattern for tokenization _WORD_PATTERN = re.compile(r""" [\u4e00-\u9fff]| # CJK Unified Ideographs (Chinese) [\u3040-\u309f]| # Hiragana [\u30a0-\u30ff]| # Katakana [\uf900-\ufaff]| # CJK Compatibility Ideographs [a-zA-Z]+(?:'[a-zA-Z]+)*| # English words (including contractions) \d+(?:\.\d+)?| # Numbers (including decimals) [^\s\w] # Punctuation """, re.VERBOSE | re.UNICODE) def estimate_tokens(text: str) -> int: """ Estimate the number of tokens in a text string. Uses regex-based tokenization with special handling for CJK characters. CJK characters are counted as ~1.5 tokens on average. """ if not text: return 0 tokens = _WORD_PATTERN.findall(text) count = 0 for token in tokens: if len(token) == 1 and _is_cjk(token): # CJK characters are roughly 1.5 tokens each count += 1.5 elif re.match(r'^[a-zA-Z]', token): # Long English words may be multiple tokens if len(token) > 6: count += max(1, len(token) / 4) else: count += 1 else: count += 1 return max(1, int(count)) def _is_cjk(char: str) -> bool: """Check if a character is a CJK character.""" cp = ord(char) return ( (0x4E00 <= cp <= 0x9FFF) or # CJK Unified Ideographs (0x3040 <= cp <= 0x309F) or # Hiragana (0x30A0 <= cp <= 0x30FF) or # Katakana (0xF900 <= cp <= 0xFAFF) or # CJK Compatibility (0x3400 <= cp <= 0x4DBF) # CJK Extension A ) def count_cjk_chars(text: str) -> int: """Count the number of CJK characters in text.""" return sum(1 for c in text if _is_cjk(c)) def count_words(text: str) -> int: """Count words (non-CJK) in text.""" words = re.findall(r'[a-zA-Z]+(?:\'[a-zA-Z]+)*', text) return len(words)