Files
ai-xn-check/utils/tokenizer.py
nosqli cdcd69256b feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为
- models.py 已加入 IdentityFingerprintModel (第5维数据模型)
- comparator.py 已升级为5维评分 (含identity维度比较)
- reporter.py 已加入身份验证报告输出
- main.py 已集成identity采集流程
- identity collector 待下次提交补充完整代码
2026-03-09 00:15:03 +08:00

66 lines
2.0 KiB
Python

"""Lightweight token estimator using regex tokenization + CJK character handling."""
import re
# Regex pattern for tokenization
_WORD_PATTERN = re.compile(r"""
[\u4e00-\u9fff]| # CJK Unified Ideographs (Chinese)
[\u3040-\u309f]| # Hiragana
[\u30a0-\u30ff]| # Katakana
[\uf900-\ufaff]| # CJK Compatibility Ideographs
[a-zA-Z]+(?:'[a-zA-Z]+)*| # English words (including contractions)
\d+(?:\.\d+)?| # Numbers (including decimals)
[^\s\w] # Punctuation
""", re.VERBOSE | re.UNICODE)
def estimate_tokens(text: str) -> int:
    """Estimate the number of tokens in *text*.

    Splits the string with the module-level tokenizer regex, then weights
    each piece: a lone CJK character counts as 1.5 tokens, an ASCII word
    longer than six letters counts as roughly one token per four letters,
    and everything else counts as one token. Always returns at least 1 for
    non-empty input.
    """
    if not text:
        return 0

    weight = 0.0
    for piece in _WORD_PATTERN.findall(text):
        if len(piece) == 1 and _is_cjk(piece):
            # CJK ideographs average ~1.5 tokens each in typical tokenizers.
            weight += 1.5
        elif piece[0].isascii() and piece[0].isalpha() and len(piece) > 6:
            # Long English words tend to be split into ~4-character subwords.
            weight += max(1, len(piece) / 4)
        else:
            weight += 1
    return max(1, int(weight))
def _is_cjk(char: str) -> bool:
"""Check if a character is a CJK character."""
cp = ord(char)
return (
(0x4E00 <= cp <= 0x9FFF) or # CJK Unified Ideographs
(0x3040 <= cp <= 0x309F) or # Hiragana
(0x30A0 <= cp <= 0x30FF) or # Katakana
(0xF900 <= cp <= 0xFAFF) or # CJK Compatibility
(0x3400 <= cp <= 0x4DBF) # CJK Extension A
)
def count_cjk_chars(text: str) -> int:
    """Return how many characters of *text* are CJK (per _is_cjk)."""
    # Booleans sum as 0/1, so this counts the True results.
    return sum(map(_is_cjk, text))
def count_words(text: str) -> int:
    """Return the number of ASCII-letter word runs in *text*.

    A contraction like "don't" counts as a single word; CJK characters
    and digits are not counted.
    """
    word_re = re.compile(r'[a-zA-Z]+(?:\'[a-zA-Z]+)*')
    return sum(1 for _ in word_re.finditer(text))