feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为 - models.py 已加入 IdentityFingerprintModel (第5维数据模型) - comparator.py 已升级为5维评分 (含identity维度比较) - reporter.py 已加入身份验证报告输出 - main.py 已集成identity采集流程 - identity collector 待下次提交补充完整代码
This commit is contained in:
142
utils/text_analysis.py
Normal file
142
utils/text_analysis.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""Text analysis utility functions for fingerprint extraction."""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
from typing import Dict, List, Set
|
||||
|
||||
|
||||
def extract_bigrams(text: str) -> Dict[str, int]:
    """Extract word bigrams from text and return frequency counts.

    Words are lowercased runs of ASCII letters or CJK ideographs; each
    bigram is rendered as ``"word1_word2"``.  Only the 50 most common
    bigrams are kept (count-descending, ties in first-seen order).

    Args:
        text: Arbitrary input text.

    Returns:
        Mapping of bigram string -> occurrence count (at most 50 entries);
        empty dict when the text yields fewer than two words.
    """
    words = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if len(words) < 2:
        return {}

    # zip pairs each word with its successor — idiomatic replacement for
    # the manual index loop, with identical insertion order.
    counts = Counter(f"{a}_{b}" for a, b in zip(words, words[1:]))
    return dict(counts.most_common(50))
|
||||
|
||||
|
||||
def calculate_vocab_richness(text: str) -> float:
    """
    Calculate vocabulary richness (type-token ratio).

    Returns ratio of unique words to total words using root TTR.
    """
    tokens = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if not tokens:
        return 0.0

    # Root TTR: normalise by sqrt(token count) so longer texts are not
    # unfairly penalised relative to short ones.
    type_count = len(set(tokens))
    return type_count / len(tokens) ** 0.5
|
||||
|
||||
|
||||
def detect_markdown_features(text: str) -> Dict[str, float]:
    """
    Detect Markdown formatting features in text.

    Each feature count is normalised by the number of lines so that long
    and short responses are comparable.

    Args:
        text: Raw (possibly Markdown-formatted) text.

    Returns:
        Dict mapping feature name ('headers', 'bullets', 'code_blocks',
        'bold', 'italic', 'inline_code') to normalised frequency (float).
    """
    lines = text.split('\n')
    total_lines = max(len(lines), 1)  # guard against division by zero

    features = {}

    # Headers (# ## ### etc.)
    header_count = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE))
    features['headers'] = header_count / total_lines

    # Bullet points (- or * or numbered)
    bullet_count = len(re.findall(r'^\s*[-*]\s', text, re.MULTILINE))
    numbered_count = len(re.findall(r'^\s*\d+\.\s', text, re.MULTILINE))
    features['bullets'] = (bullet_count + numbered_count) / total_lines

    # Fenced code blocks: each block contributes two ``` fences, so halve
    # the fence count.  (Removed the redundant `if count else 0` branch:
    # 0 / anything is already 0, and the value is now uniformly a float.)
    code_block_count = len(re.findall(r'```', text))
    features['code_blocks'] = code_block_count / (2 * total_lines)

    # Bold (**text** or __text__)
    bold_count = len(re.findall(r'\*\*[^*]+\*\*|__[^_]+__', text))
    features['bold'] = bold_count / total_lines

    # Italic (*text* or _text_) — lookarounds exclude the ** / __ bold markers
    italic_count = len(re.findall(r'(?<!\*)\*(?!\*)[^*]+\*(?!\*)|(?<!_)_(?!_)[^_]+_(?!_)', text))
    features['italic'] = italic_count / total_lines

    # Inline code (`code`) — lookarounds exclude ``` fences
    inline_code_count = len(re.findall(r'(?<!`)`(?!`)[^`]+`(?!`)', text))
    features['inline_code'] = inline_code_count / total_lines

    return features
|
||||
|
||||
|
||||
def extract_opening_pattern(text: str, n_words: int = 5) -> str:
    """Extract the opening pattern (first N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""

    # str.split() with no argument collapses whitespace runs, which is
    # equivalent to scanning for \S+ tokens.
    tokens = stripped.split()
    return ' '.join(tokens[:n_words]).lower()
|
||||
|
||||
|
||||
def extract_closing_pattern(text: str, n_words: int = 5) -> str:
    """Extract the closing pattern (last N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""

    # Negative slice keeps at most the last n_words tokens; shorter texts
    # simply yield every token.
    tokens = stripped.split()
    tail = tokens[-n_words:]
    return ' '.join(tail).lower()
|
||||
|
||||
|
||||
def calculate_cjk_ratio(text: str) -> float:
    """Calculate the ratio of CJK characters to total non-whitespace characters."""
    if not text:
        return 0.0

    non_whitespace = re.findall(r'\S', text)
    if not non_whitespace:
        # Whitespace-only input: nothing to classify.
        return 0.0

    # Han ideographs plus the hiragana and katakana ranges.
    cjk = re.findall(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]', text)
    return len(cjk) / len(non_whitespace)
|
||||
|
||||
|
||||
def jaccard_similarity(set_a: set, set_b: set) -> float:
    """Calculate Jaccard similarity between two sets."""
    # Two empty sets are treated as identical by convention.
    if not set_a:
        return 1.0 if not set_b else 0.0
    if not set_b:
        return 0.0

    overlap = len(set_a & set_b)
    union_size = len(set_a | set_b)
    return overlap / union_size if union_size > 0 else 0.0
|
||||
|
||||
|
||||
def dict_cosine_similarity(dict_a: Dict[str, float], dict_b: Dict[str, float]) -> float:
    """
    Calculate cosine similarity between two sparse vectors represented as dicts.
    """
    if not dict_a or not dict_b:
        return 0.0

    # Keys present in only one vector contribute zero terms, so the dot
    # product only needs the key intersection.
    shared_keys = set(dict_a) & set(dict_b)
    dot = sum(dict_a[k] * dict_b[k] for k in shared_keys)

    magnitude_a = sum(v * v for v in dict_a.values()) ** 0.5
    magnitude_b = sum(v * v for v in dict_b.values()) ** 0.5
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0

    return dot / (magnitude_a * magnitude_b)
|
||||
|
||||
|
||||
def text_similarity(text_a: str, text_b: str) -> float:
    """Calculate word-level Jaccard similarity between two texts."""
    def _tokens(t: str) -> Set[str]:
        # Same tokenisation used throughout the module: lowercased runs
        # of ASCII letters or CJK ideographs.
        return set(re.findall(r'[a-zA-Z\u4e00-\u9fff]+', t.lower()))

    return jaccard_similarity(_tokens(text_a), _tokens(text_b))
|
||||
Reference in New Issue
Block a user