# Changelog (from commit message):
# - 4-dimensional fingerprint collection: performance / language / capability / behavior
# - models.py: added IdentityFingerprintModel (data model for the 5th dimension)
# - comparator.py: upgraded to 5-dimension scoring (includes identity comparison)
# - reporter.py: added identity-verification report output
# - main.py: integrated the identity collection flow
# - identity collector: full code to follow in the next commit
"""Text analysis utility functions for fingerprint extraction."""
import re
from collections import Counter
from typing import Dict, List, Set


def extract_bigrams(text: str, top_n: int = 50) -> Dict[str, int]:
    """Extract word bigrams from text and return frequency counts.

    Words are runs of latin letters or CJK ideographs, lowercased before
    pairing, so punctuation and digits never appear in a bigram.

    Args:
        text: Input text to analyze.
        top_n: Maximum number of most-frequent bigrams to return
            (default 50, matching the previous hard-coded limit).

    Returns:
        Mapping of "word1_word2" keys to occurrence counts, limited to the
        top_n most common bigrams. Empty dict if fewer than two words.
    """
    words = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if len(words) < 2:
        return {}

    # zip(words, words[1:]) pairs each word with its successor.
    bigrams = [f"{a}_{b}" for a, b in zip(words, words[1:])]
    return dict(Counter(bigrams).most_common(top_n))


def calculate_vocab_richness(text: str) -> float:
    """
    Calculate vocabulary richness (type-token ratio).

    Returns ratio of unique words to total words using root TTR.
    """
    tokens = re.findall(r'[a-zA-Z\u4e00-\u9fff]+', text.lower())
    if not tokens:
        return 0.0

    # Root TTR: dividing by sqrt(token count) instead of the raw count
    # reduces sensitivity to text length.
    return len(set(tokens)) / len(tokens) ** 0.5


def detect_markdown_features(text: str) -> Dict[str, float]:
    """
    Detect Markdown formatting features in text.

    Returns dict of feature_name -> normalized frequency.
    """
    line_count = max(len(text.split('\n')), 1)

    def _per_line(pattern: str, flags: int = 0) -> float:
        # Normalize a raw regex match count by the number of lines.
        return len(re.findall(pattern, text, flags)) / line_count

    features: Dict[str, float] = {}

    # ATX-style headers: 1-6 '#' at line start followed by whitespace.
    features['headers'] = _per_line(r'^#{1,6}\s', re.MULTILINE)

    # List items: unordered (- or *) plus ordered ("1. ", "2. ", ...).
    bullet_hits = len(re.findall(r'^\s*[-*]\s', text, re.MULTILINE))
    numbered_hits = len(re.findall(r'^\s*\d+\.\s', text, re.MULTILINE))
    features['bullets'] = (bullet_hits + numbered_hits) / line_count

    # Fenced code blocks: ``` markers come in open/close pairs, hence /2.
    fence_hits = len(re.findall(r'```', text))
    features['code_blocks'] = fence_hits / (2 * line_count) if fence_hits else 0

    # Bold (**text** or __text__).
    features['bold'] = _per_line(r'\*\*[^*]+\*\*|__[^_]+__')

    # Italic: single * or _ delimiters, excluding the double bold markers.
    features['italic'] = _per_line(
        r'(?<!\*)\*(?!\*)[^*]+\*(?!\*)|(?<!_)_(?!_)[^_]+_(?!_)')

    # Inline code spans (`code`), excluding triple-backtick fences.
    features['inline_code'] = _per_line(r'(?<!`)`(?!`)[^`]+`(?!`)')

    return features


def extract_opening_pattern(text: str, n_words: int = 5) -> str:
    """Extract the opening pattern (first N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""

    # \S+ tokenizes on whitespace runs, like str.split() but via regex.
    tokens = re.findall(r'\S+', stripped)
    return ' '.join(tokens[:n_words]).lower()


def extract_closing_pattern(text: str, n_words: int = 5) -> str:
    """Extract the closing pattern (last N words) from text."""
    stripped = text.strip()
    if not stripped:
        return ""

    # Take the trailing n_words tokens; fewer if the text is shorter.
    tokens = re.findall(r'\S+', stripped)
    return ' '.join(tokens[-n_words:]).lower()


def calculate_cjk_ratio(text: str) -> float:
    """Calculate the ratio of CJK characters to total non-whitespace characters.

    CJK here covers Chinese ideographs (U+4E00-U+9FFF), Japanese hiragana
    (U+3040-U+309F) and katakana (U+30A0-U+30FF), and Korean hangul
    syllables (U+AC00-U+D7AF).

    Returns:
        Ratio in [0.0, 1.0]; 0.0 for empty or whitespace-only input.
    """
    if not text:
        return 0.0

    total_chars = len(re.findall(r'\S', text))
    if total_chars == 0:
        return 0.0

    # Fix: hangul syllables were previously missing from the character
    # class, so Korean text always scored 0 despite the "CJK" contract.
    cjk_chars = len(re.findall(
        r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]', text))
    return cjk_chars / total_chars


def jaccard_similarity(set_a: set, set_b: set) -> float:
    """Calculate Jaccard similarity between two sets."""
    # Two empty sets are treated as identical by convention.
    if not set_a and not set_b:
        return 1.0
    if not set_a or not set_b:
        return 0.0

    union_size = len(set_a | set_b)
    if union_size == 0:
        return 0.0
    return len(set_a & set_b) / union_size


def dict_cosine_similarity(dict_a: Dict[str, float], dict_b: Dict[str, float]) -> float:
    """
    Calculate cosine similarity between two sparse vectors represented as dicts.

    Args:
        dict_a: Sparse vector mapping feature keys to weights.
        dict_b: Sparse vector mapping feature keys to weights.

    Returns:
        Cosine similarity ([0, 1] for non-negative weights, [-1, 1] in
        general); 0.0 if either vector is empty or has zero norm.
    """
    if not dict_a or not dict_b:
        return 0.0

    # Only keys present in both dicts contribute to the dot product, so
    # iterate the key intersection instead of the union — same result,
    # no wasted zero terms.
    shared_keys = dict_a.keys() & dict_b.keys()
    dot_product = sum(dict_a[k] * dict_b[k] for k in shared_keys)

    norm_a = sum(v ** 2 for v in dict_a.values()) ** 0.5
    norm_b = sum(v ** 2 for v in dict_b.values()) ** 0.5

    if norm_a == 0 or norm_b == 0:
        return 0.0

    return dot_product / (norm_a * norm_b)


def text_similarity(text_a: str, text_b: str) -> float:
    """Calculate word-level Jaccard similarity between two texts."""
    word_pattern = r'[a-zA-Z\u4e00-\u9fff]+'
    words_a = set(re.findall(word_pattern, text_a.lower()))
    words_b = set(re.findall(word_pattern, text_b.lower()))

    # Inlined Jaccard: two word-free texts count as identical (1.0);
    # exactly one empty vocabulary means no overlap (0.0).
    if not words_a and not words_b:
        return 1.0
    if not words_a or not words_b:
        return 0.0
    return len(words_a & words_b) / len(words_a | words_b)