"""Text analysis utility functions for fingerprint extraction.""" import re from collections import Counter from typing import Dict, List, Set def extract_bigrams(text: str) -> Dict[str, int]: """Extract word bigrams from text and return frequency counts.""" words = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower()) if len(words) < 2: return {} bigrams = [] for i in range(len(words) - 1): bigrams.append(f"{words[i]}_{words[i+1]}") return dict(Counter(bigrams).most_common(50)) def calculate_vocab_richness(text: str) -> float: """ Calculate vocabulary richness (type-token ratio). Returns ratio of unique words to total words using root TTR. """ words = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower()) if not words: return 0.0 unique_words = set(words) # Use root TTR to reduce sensitivity to text length return len(unique_words) / (len(words) ** 0.5) def detect_markdown_features(text: str) -> Dict[str, float]: """ Detect Markdown formatting features in text. Returns dict of feature_name -> normalized frequency. """ lines = text.split('\n') total_lines = max(len(lines), 1) features = {} # Headers (# ## ### etc.) header_count = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) features['headers'] = header_count / total_lines # Bullet points (- or * or numbered) bullet_count = len(re.findall(r'^\s*[-*]\s', text, re.MULTILINE)) numbered_count = len(re.findall(r'^\s*\d+\.\s', text, re.MULTILINE)) features['bullets'] = (bullet_count + numbered_count) / total_lines # Code blocks (``` or indented) code_block_count = len(re.findall(r'```', text)) features['code_blocks'] = code_block_count / (2 * total_lines) if code_block_count else 0 # Bold (**text** or __text__) bold_count = len(re.findall(r'\*\*[^*]+\*\*|__[^_]+__', text)) features['bold'] = bold_count / total_lines # Italic (*text* or _text_ — but not ** or __) italic_count = len(re.findall(r'(? str: """Extract the opening pattern (first N words) from text.""" text = text.strip() if not text: return "" words = re.findall(r'\S+', text) return ' '.join(words[:n_words]).lower() def extract_closing_pattern(text: str, n_words: int = 5) -> str: """Extract the closing pattern (last N words) from text.""" text = text.strip() if not text: return "" words = re.findall(r'\S+', text) return ' '.join(words[-n_words:]).lower() def calculate_cjk_ratio(text: str) -> float: """Calculate the ratio of CJK characters to total non-whitespace characters.""" if not text: return 0.0 total_chars = len(re.findall(r'\S', text)) if total_chars == 0: return 0.0 cjk_chars = len(re.findall(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]', text)) return cjk_chars / total_chars def jaccard_similarity(set_a: set, set_b: set) -> float: """Calculate Jaccard similarity between two sets.""" if not set_a and not set_b: return 1.0 if not set_a or not set_b: return 0.0 intersection = len(set_a & set_b) union = len(set_a | set_b) return intersection / union if union > 0 else 0.0 def dict_cosine_similarity(dict_a: Dict[str, float], dict_b: Dict[str, float]) -> float: """ Calculate cosine similarity between two sparse vectors represented as dicts. """ if not dict_a or not dict_b: return 0.0 all_keys = set(dict_a.keys()) | set(dict_b.keys()) dot_product = sum(dict_a.get(k, 0) * dict_b.get(k, 0) for k in all_keys) norm_a = sum(v ** 2 for v in dict_a.values()) ** 0.5 norm_b = sum(v ** 2 for v in dict_b.values()) ** 0.5 if norm_a == 0 or norm_b == 0: return 0.0 return dot_product / (norm_a * norm_b) def text_similarity(text_a: str, text_b: str) -> float: """Calculate word-level Jaccard similarity between two texts.""" words_a = set(re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text_a.lower())) words_b = set(re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text_b.lower())) return jaccard_similarity(words_a, words_b)