Files
ai-xn-check/utils/text_analysis.py
nosqli cdcd69256b feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为
- models.py 已加入 IdentityFingerprintModel (第5维数据模型)
- comparator.py 已升级为5维评分 (含identity维度比较)
- reporter.py 已加入身份验证报告输出
- main.py 已集成identity采集流程
- identity collector 待下次提交补充完整代码
2026-03-09 00:15:03 +08:00

143 lines
4.4 KiB
Python

"""Text analysis utility functions for fingerprint extraction."""
import re
from collections import Counter
from typing import Dict, List, Set
def extract_bigrams(text: str, top_k: int = 50) -> Dict[str, int]:
    """Extract word bigrams from text and return frequency counts.

    Words are runs of ASCII letters or CJK ideographs, lowercased before
    pairing, so "Foo Bar" and "foo bar" produce the same bigram.

    Args:
        text: Input text to analyze.
        top_k: Maximum number of most-frequent bigrams to return
            (default 50, matching the previous fixed cutoff).

    Returns:
        Mapping of "word1_word2" bigram keys to occurrence counts,
        limited to the top_k most common; empty dict when the text
        contains fewer than two words.
    """
    words = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    # At least two words are required to form a single bigram.
    if len(words) < 2:
        return {}
    # zip(words, words[1:]) yields consecutive pairs without index math.
    bigrams = (f"{first}_{second}" for first, second in zip(words, words[1:]))
    return dict(Counter(bigrams).most_common(top_k))
def calculate_vocab_richness(text: str) -> float:
    """
    Calculate vocabulary richness (type-token ratio).
    Returns ratio of unique words to total words using root TTR.
    """
    tokens = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if not tokens:
        return 0.0
    # Root TTR (unique / sqrt(total)) is less sensitive to text length
    # than the plain type-token ratio.
    distinct_count = len(set(tokens))
    return distinct_count / len(tokens) ** 0.5
def detect_markdown_features(text: str) -> Dict[str, float]:
    """
    Detect Markdown formatting features in text.

    Each raw count is normalized by the number of lines so long and
    short texts are comparable.

    Returns:
        Dict of feature_name -> normalized frequency.
    """
    line_count = max(len(text.split('\n')), 1)

    def count(pattern: str, flags: int = 0) -> int:
        # Number of non-overlapping matches of *pattern* in the text.
        return len(re.findall(pattern, text, flags))

    headers = count(r'^#{1,6}\s', re.MULTILINE)          # ATX headers: # .. ######
    unordered = count(r'^\s*[-*]\s', re.MULTILINE)       # - item / * item
    ordered = count(r'^\s*\d+\.\s', re.MULTILINE)        # 1. item
    fences = count(r'```')                                # code-fence markers
    bold = count(r'\*\*[^*]+\*\*|__[^_]+__')
    # Single * or _ delimiters, rejecting doubled markers on either side.
    italic = count(r'(?<!\*)\*(?!\*)[^*]+\*(?!\*)|(?<!_)_(?!_)[^_]+_(?!_)')
    inline_code = count(r'(?<!`)`(?!`)[^`]+`(?!`)')

    return {
        'headers': headers / line_count,
        'bullets': (unordered + ordered) / line_count,
        # Each fenced block uses two ``` markers, hence the factor of 2.
        'code_blocks': fences / (2 * line_count) if fences else 0,
        'bold': bold / line_count,
        'italic': italic / line_count,
        'inline_code': inline_code / line_count,
    }
def extract_opening_pattern(text: str, n_words: int = 5) -> str:
    """Extract the opening pattern (first N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""
    # str.split() with no argument splits on any whitespace run,
    # equivalent to matching \S+ tokens.
    tokens = stripped.split()
    return ' '.join(tokens[:n_words]).lower()
def extract_closing_pattern(text: str, n_words: int = 5) -> str:
    """Extract the closing pattern (last N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""
    # Whitespace-delimited tokens, identical to matching \S+ runs.
    tokens = stripped.split()
    return ' '.join(tokens[-n_words:]).lower()
def calculate_cjk_ratio(text: str) -> float:
    """Calculate the ratio of CJK characters to total non-whitespace characters."""
    if not text:
        return 0.0
    non_whitespace = re.findall(r'\S', text)
    if not non_whitespace:
        # Whitespace-only input: no denominator to divide by.
        return 0.0
    # Covers CJK unified ideographs plus hiragana and katakana ranges.
    cjk_matches = re.findall(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]', text)
    return len(cjk_matches) / len(non_whitespace)
def jaccard_similarity(set_a: set, set_b: set) -> float:
    """Calculate Jaccard similarity between two sets."""
    # Two empty sets are defined as identical.
    if not set_a and not set_b:
        return 1.0
    # Exactly one empty set shares nothing with the other.
    if not (set_a and set_b):
        return 0.0
    # Both non-empty here, so the union size is at least 1.
    return len(set_a & set_b) / len(set_a | set_b)
def dict_cosine_similarity(dict_a: Dict[str, float], dict_b: Dict[str, float]) -> float:
    """
    Calculate cosine similarity between two sparse vectors represented as dicts.

    Args:
        dict_a: Sparse vector as key -> weight mapping.
        dict_b: Sparse vector as key -> weight mapping.

    Returns:
        Cosine similarity in [0, 1] for non-negative weights; 0.0 when
        either dict is empty or either vector has zero norm.
    """
    if not dict_a or not dict_b:
        return 0.0
    # Only keys present in BOTH dicts contribute to the dot product, so
    # iterate the smaller dict instead of materializing the key union.
    smaller, larger = (dict_a, dict_b) if len(dict_a) <= len(dict_b) else (dict_b, dict_a)
    dot_product = sum(weight * larger[key] for key, weight in smaller.items() if key in larger)
    norm_a = sum(v ** 2 for v in dict_a.values()) ** 0.5
    norm_b = sum(v ** 2 for v in dict_b.values()) ** 0.5
    # Guard against zero-norm vectors (all-zero weights) to avoid ZeroDivisionError.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot_product / (norm_a * norm_b)
def text_similarity(text_a: str, text_b: str) -> float:
    """Calculate word-level Jaccard similarity between two texts."""
    def word_set(source: str) -> Set[str]:
        # Tokens are runs of ASCII letters or CJK ideographs, lowercased.
        return set(re.findall(r'[a-zA-Z\u4e00-\u9fff]+', source.lower()))

    return jaccard_similarity(word_set(text_a), word_set(text_b))