feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为 - models.py 已加入 IdentityFingerprintModel (第5维数据模型) - comparator.py 已升级为5维评分 (含identity维度比较) - reporter.py 已加入身份验证报告输出 - main.py 已集成identity采集流程 - identity collector 待下次提交补充完整代码
This commit is contained in:
142
utils/text_analysis.py
Normal file
142
utils/text_analysis.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""Text analysis utility functions for fingerprint extraction."""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
from typing import Dict, List, Set
|
||||
|
||||
|
||||
def extract_bigrams(text: str) -> Dict[str, int]:
    """Extract word bigrams from text and return frequency counts.

    Words are lowercased runs of ASCII letters or CJK ideographs; each
    bigram is rendered as ``"word1_word2"``.  Only the 50 most common
    bigrams are kept (count-descending, ties in first-seen order).

    Args:
        text: Arbitrary input text.

    Returns:
        Mapping of bigram string -> occurrence count (at most 50 entries);
        empty dict when the text yields fewer than two words.
    """
    words = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if len(words) < 2:
        return {}

    # zip pairs each word with its successor — idiomatic replacement for
    # the manual index loop, with identical insertion order.
    counts = Counter(f"{a}_{b}" for a, b in zip(words, words[1:]))
    return dict(counts.most_common(50))
|
||||
|
||||
|
||||
def calculate_vocab_richness(text: str) -> float:
    """
    Calculate vocabulary richness (type-token ratio).

    Returns ratio of unique words to total words using root TTR.
    """
    tokens = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if not tokens:
        return 0.0

    # Root TTR: normalise by sqrt(token count) so longer texts are not
    # unfairly penalised relative to short ones.
    type_count = len(set(tokens))
    return type_count / len(tokens) ** 0.5
|
||||
|
||||
|
||||
def detect_markdown_features(text: str) -> Dict[str, float]:
    """
    Detect Markdown formatting features in text.

    Each feature count is normalised by the number of lines so that long
    and short responses are comparable.

    Args:
        text: Raw (possibly Markdown-formatted) text.

    Returns:
        Dict mapping feature name ('headers', 'bullets', 'code_blocks',
        'bold', 'italic', 'inline_code') to normalised frequency (float).
    """
    lines = text.split('\n')
    total_lines = max(len(lines), 1)  # guard against division by zero

    features = {}

    # Headers (# ## ### etc.)
    header_count = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE))
    features['headers'] = header_count / total_lines

    # Bullet points (- or * or numbered)
    bullet_count = len(re.findall(r'^\s*[-*]\s', text, re.MULTILINE))
    numbered_count = len(re.findall(r'^\s*\d+\.\s', text, re.MULTILINE))
    features['bullets'] = (bullet_count + numbered_count) / total_lines

    # Fenced code blocks: each block contributes two ``` fences, so halve
    # the fence count.  (Removed the redundant `if count else 0` branch:
    # 0 / anything is already 0, and the value is now uniformly a float.)
    code_block_count = len(re.findall(r'```', text))
    features['code_blocks'] = code_block_count / (2 * total_lines)

    # Bold (**text** or __text__)
    bold_count = len(re.findall(r'\*\*[^*]+\*\*|__[^_]+__', text))
    features['bold'] = bold_count / total_lines

    # Italic (*text* or _text_) — lookarounds exclude the ** / __ bold markers
    italic_count = len(re.findall(r'(?<!\*)\*(?!\*)[^*]+\*(?!\*)|(?<!_)_(?!_)[^_]+_(?!_)', text))
    features['italic'] = italic_count / total_lines

    # Inline code (`code`) — lookarounds exclude ``` fences
    inline_code_count = len(re.findall(r'(?<!`)`(?!`)[^`]+`(?!`)', text))
    features['inline_code'] = inline_code_count / total_lines

    return features
|
||||
|
||||
|
||||
def extract_opening_pattern(text: str, n_words: int = 5) -> str:
    """Extract the opening pattern (first N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""

    # str.split() with no argument collapses whitespace runs, which is
    # equivalent to scanning for \S+ tokens.
    tokens = stripped.split()
    return ' '.join(tokens[:n_words]).lower()
|
||||
|
||||
|
||||
def extract_closing_pattern(text: str, n_words: int = 5) -> str:
    """Extract the closing pattern (last N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""

    # Negative slice keeps at most the last n_words tokens; shorter texts
    # simply yield every token.
    tokens = stripped.split()
    tail = tokens[-n_words:]
    return ' '.join(tail).lower()
|
||||
|
||||
|
||||
def calculate_cjk_ratio(text: str) -> float:
    """Calculate the ratio of CJK characters to total non-whitespace characters."""
    if not text:
        return 0.0

    non_whitespace = re.findall(r'\S', text)
    if not non_whitespace:
        # Whitespace-only input: nothing to classify.
        return 0.0

    # Han ideographs plus the hiragana and katakana ranges.
    cjk = re.findall(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]', text)
    return len(cjk) / len(non_whitespace)
|
||||
|
||||
|
||||
def jaccard_similarity(set_a: set, set_b: set) -> float:
    """Calculate Jaccard similarity between two sets."""
    # Two empty sets are treated as identical by convention.
    if not set_a:
        return 1.0 if not set_b else 0.0
    if not set_b:
        return 0.0

    overlap = len(set_a & set_b)
    union_size = len(set_a | set_b)
    return overlap / union_size if union_size > 0 else 0.0
|
||||
|
||||
|
||||
def dict_cosine_similarity(dict_a: Dict[str, float], dict_b: Dict[str, float]) -> float:
    """
    Calculate cosine similarity between two sparse vectors represented as dicts.
    """
    if not dict_a or not dict_b:
        return 0.0

    # Keys present in only one vector contribute zero terms, so the dot
    # product only needs the key intersection.
    shared_keys = set(dict_a) & set(dict_b)
    dot = sum(dict_a[k] * dict_b[k] for k in shared_keys)

    magnitude_a = sum(v * v for v in dict_a.values()) ** 0.5
    magnitude_b = sum(v * v for v in dict_b.values()) ** 0.5
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0

    return dot / (magnitude_a * magnitude_b)
|
||||
|
||||
|
||||
def text_similarity(text_a: str, text_b: str) -> float:
    """Calculate word-level Jaccard similarity between two texts."""
    def _tokens(t: str) -> Set[str]:
        # Same tokenisation used throughout the module: lowercased runs
        # of ASCII letters or CJK ideographs.
        return set(re.findall(r'[a-zA-Z\u4e00-\u9fff]+', t.lower()))

    return jaccard_similarity(_tokens(text_a), _tokens(text_b))
|
||||
Reference in New Issue
Block a user