Files
ai-xn-check/utils/text_analysis.py
nosqli cdcd69256b feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为
- models.py 已加入 IdentityFingerprintModel (第5维数据模型)
- comparator.py 已升级为5维评分 (含identity维度比较)
- reporter.py 已加入身份验证报告输出
- main.py 已集成identity采集流程
- identity collector 待下次提交补充完整代码
2026-03-09 00:15:03 +08:00

143 lines
4.4 KiB
Python

"""Text analysis utility functions for fingerprint extraction."""
import re
from collections import Counter
from typing import Dict, List, Set
def extract_bigrams(text: str, top_k: int = 50) -> Dict[str, int]:
    """Extract word bigrams from text and return frequency counts.

    Words are runs of ASCII letters or CJK ideographs, lowercased before
    pairing, so "Foo Bar" and "foo bar" produce the same bigram.

    Args:
        text: Input text to analyze.
        top_k: Maximum number of most-frequent bigrams to return
            (default 50, matching the previous fixed cutoff).

    Returns:
        Mapping of "word1_word2" bigram keys to occurrence counts,
        limited to the top_k most common; empty dict when the text
        contains fewer than two words.
    """
    words = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    # At least two words are required to form a single bigram.
    if len(words) < 2:
        return {}
    # zip(words, words[1:]) yields consecutive pairs without index math.
    bigrams = (f"{first}_{second}" for first, second in zip(words, words[1:]))
    return dict(Counter(bigrams).most_common(top_k))
def calculate_vocab_richness(text: str) -> float:
    """
    Calculate vocabulary richness (type-token ratio).
    Returns ratio of unique words to total words using root TTR.
    """
    tokens = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if not tokens:
        return 0.0
    # Root TTR (unique / sqrt(total)) is less sensitive to text length
    # than the plain type-token ratio.
    distinct_count = len(set(tokens))
    return distinct_count / len(tokens) ** 0.5
def detect_markdown_features(text: str) -> Dict[str, float]:
    """
    Detect Markdown formatting features in text.

    Each raw count is normalized by the number of lines so long and
    short texts are comparable.

    Returns:
        Dict of feature_name -> normalized frequency.
    """
    line_count = max(len(text.split('\n')), 1)

    def count(pattern: str, flags: int = 0) -> int:
        # Number of non-overlapping matches of *pattern* in the text.
        return len(re.findall(pattern, text, flags))

    headers = count(r'^#{1,6}\s', re.MULTILINE)          # ATX headers: # .. ######
    unordered = count(r'^\s*[-*]\s', re.MULTILINE)       # - item / * item
    ordered = count(r'^\s*\d+\.\s', re.MULTILINE)        # 1. item
    fences = count(r'```')                                # code-fence markers
    bold = count(r'\*\*[^*]+\*\*|__[^_]+__')
    # Single * or _ delimiters, rejecting doubled markers on either side.
    italic = count(r'(?<!\*)\*(?!\*)[^*]+\*(?!\*)|(?<!_)_(?!_)[^_]+_(?!_)')
    inline_code = count(r'(?<!`)`(?!`)[^`]+`(?!`)')

    return {
        'headers': headers / line_count,
        'bullets': (unordered + ordered) / line_count,
        # Each fenced block uses two ``` markers, hence the factor of 2.
        'code_blocks': fences / (2 * line_count) if fences else 0,
        'bold': bold / line_count,
        'italic': italic / line_count,
        'inline_code': inline_code / line_count,
    }
def extract_opening_pattern(text: str, n_words: int = 5) -> str:
    """Extract the opening pattern (first N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""
    # str.split() with no argument splits on any whitespace run,
    # equivalent to matching \S+ tokens.
    tokens = stripped.split()
    return ' '.join(tokens[:n_words]).lower()
def extract_closing_pattern(text: str, n_words: int = 5) -> str:
    """Extract the closing pattern (last N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""
    # Whitespace-delimited tokens, identical to matching \S+ runs.
    tokens = stripped.split()
    return ' '.join(tokens[-n_words:]).lower()
def calculate_cjk_ratio(text: str) -> float:
    """Calculate the ratio of CJK characters to total non-whitespace characters."""
    if not text:
        return 0.0
    non_whitespace = re.findall(r'\S', text)
    if not non_whitespace:
        # Whitespace-only input: no denominator to divide by.
        return 0.0
    # Covers CJK unified ideographs plus hiragana and katakana ranges.
    cjk_matches = re.findall(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]', text)
    return len(cjk_matches) / len(non_whitespace)
def jaccard_similarity(set_a: set, set_b: set) -> float:
    """Calculate Jaccard similarity between two sets."""
    # Two empty sets are defined as identical.
    if not set_a and not set_b:
        return 1.0
    # Exactly one empty set shares nothing with the other.
    if not (set_a and set_b):
        return 0.0
    # Both non-empty here, so the union size is at least 1.
    return len(set_a & set_b) / len(set_a | set_b)
def dict_cosine_similarity(dict_a: Dict[str, float], dict_b: Dict[str, float]) -> float:
    """
    Calculate cosine similarity between two sparse vectors represented as dicts.

    Args:
        dict_a: Sparse vector as key -> weight mapping.
        dict_b: Sparse vector as key -> weight mapping.

    Returns:
        Cosine similarity in [0, 1] for non-negative weights; 0.0 when
        either dict is empty or either vector has zero norm.
    """
    if not dict_a or not dict_b:
        return 0.0
    # Only keys present in BOTH dicts contribute to the dot product, so
    # iterate the smaller dict instead of materializing the key union.
    smaller, larger = (dict_a, dict_b) if len(dict_a) <= len(dict_b) else (dict_b, dict_a)
    dot_product = sum(weight * larger[key] for key, weight in smaller.items() if key in larger)
    norm_a = sum(v ** 2 for v in dict_a.values()) ** 0.5
    norm_b = sum(v ** 2 for v in dict_b.values()) ** 0.5
    # Guard against zero-norm vectors (all-zero weights) to avoid ZeroDivisionError.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot_product / (norm_a * norm_b)
def text_similarity(text_a: str, text_b: str) -> float:
    """Calculate word-level Jaccard similarity between two texts."""
    def word_set(source: str) -> Set[str]:
        # Tokens are runs of ASCII letters or CJK ideographs, lowercased.
        return set(re.findall(r'[a-zA-Z\u4e00-\u9fff]+', source.lower()))

    return jaccard_similarity(word_set(text_a), word_set(text_b))