- 4维指纹采集: 性能/语言/能力/行为 - models.py 已加入 IdentityFingerprintModel (第5维数据模型) - comparator.py 已升级为5维评分 (含identity维度比较) - reporter.py 已加入身份验证报告输出 - main.py 已集成identity采集流程 - identity collector 待下次提交补充完整代码
118 lines
4.3 KiB
Python
118 lines
4.3 KiB
Python
"""Language fingerprint collector — vocabulary, formatting, patterns, CJK ratio."""
|
||
|
||
from typing import Dict, List
|
||
from core.client import AIClient
|
||
from core.models import LanguageFingerprint, CollectionConfig
|
||
from utils.text_analysis import (
|
||
extract_bigrams, calculate_vocab_richness, detect_markdown_features,
|
||
extract_opening_pattern, extract_closing_pattern, calculate_cjk_ratio,
|
||
)
|
||
|
||
|
||
# Eight prompts, each chosen to provoke a different kind of language behavior.
# The order is significant to callers that report failures by 1-based position.
LANGUAGE_PROMPTS = [
    # General explanation: exposes the natural prose style.
    "Explain how photosynthesis works in simple terms.",
    # Technical writing: exposes list/heading formatting tendencies.
    "List 5 best practices for writing clean code and explain each briefly.",
    # Creative writing: exposes vocabulary richness.
    "Describe a sunset over the ocean in a vivid, poetic paragraph.",
    # Chinese-only request: exposes CJK handling.
    "请用中文解释什么是机器学习,以及它在日常生活中的应用。",
    # Structured comparison: exposes formatting patterns.
    "Compare Python and JavaScript: give 3 similarities and 3 differences.",
    # Balanced analysis: exposes reasoning language.
    "What are the pros and cons of remote work? Give a balanced analysis.",
    # Instructions: exposes step-by-step patterns.
    "How do you make a cup of pour-over coffee? Give step-by-step instructions.",
    # Mixed Chinese/English request: exposes code-switching behavior.
    "用中英文混合的方式解释什么是API(应用程序编程接口),可以适当使用英文技术术语。",
]
|
||
|
||
|
||
async def collect_language(client: AIClient, config: CollectionConfig,
                           progress_callback=None) -> LanguageFingerprint:
    """Collect the language fingerprint of an AI API channel.

    Every prompt in ``LANGUAGE_PROMPTS`` is sent through ``client`` in turn.
    Each non-empty response is analyzed for bigrams, markdown features,
    opening/closing patterns and CJK ratio, and the per-response results are
    then aggregated into a single fingerprint.

    Collection is best-effort: an exception raised while requesting or
    analyzing one prompt is reported through ``progress_callback`` (if any)
    and that prompt is skipped; the remaining prompts still run.

    Args:
        client: Channel client. ``await client.send_message(prompt=...,
            max_tokens=...)`` must produce a ``(text, latency, headers)``
            3-tuple; only ``text`` is used here.
        config: Collection settings; only ``config.max_tokens`` is read.
        progress_callback: Optional callable taking one status string. It is
            invoked once per prompt with a success, failure or
            empty-response message.

    Returns:
        A ``LanguageFingerprint`` built from:
        vocabulary richness of all responses joined by newlines, the 30 most
        frequent bigrams, per-key averages of the markdown features, the
        collected opening and closing patterns, and the mean CJK ratio
        (``0.0`` when no response was analyzed).
    """
    all_texts: List[str] = []
    all_bigrams: Dict[str, int] = {}
    all_format_features: Dict[str, List[float]] = {}
    opening_patterns: List[str] = []
    closing_patterns: List[str] = []
    cjk_ratios: List[float] = []

    total_tasks = len(LANGUAGE_PROMPTS)
    completed = 0

    for prompt_idx, prompt in enumerate(LANGUAGE_PROMPTS):
        try:
            # Latency and headers are returned by the client but are not part
            # of the language dimension.
            text, _latency, _headers = await client.send_message(
                prompt=prompt,
                max_tokens=config.max_tokens,
            )

            if text:
                # Run every analyzer BEFORE touching the accumulators, so a
                # failure in any of them cannot leave this response half
                # recorded (e.g. counted in the bigrams but missing from the
                # CJK ratios).
                bigrams = extract_bigrams(text)
                features = detect_markdown_features(text)
                opening = extract_opening_pattern(text)
                closing = extract_closing_pattern(text)
                cjk_ratio = calculate_cjk_ratio(text)

                # Commit the fully analyzed response.
                for k, v in bigrams.items():
                    all_bigrams[k] = all_bigrams.get(k, 0) + v
                for k, v in features.items():
                    all_format_features.setdefault(k, []).append(v)
                all_texts.append(text)
                if opening:
                    opening_patterns.append(opening)
                if closing:
                    closing_patterns.append(closing)
                cjk_ratios.append(cjk_ratio)

        except Exception as e:
            # Deliberate best-effort boundary: one bad prompt must not abort
            # the whole collection run.
            if progress_callback:
                progress_callback(f" ⚠ Language prompt {prompt_idx+1} failed: {e}")
            continue

        if not text:
            # An empty response contributes nothing; say so instead of
            # skipping it silently, otherwise the progress count stalls with
            # no explanation.
            if progress_callback:
                progress_callback(
                    f" ⚠ Language prompt {prompt_idx+1} returned an empty response"
                )
            continue

        completed += 1
        if progress_callback:
            progress_callback(f" Language: {completed}/{total_tasks}")

    # Aggregate results
    combined_text = "\n".join(all_texts)
    vocab_richness = calculate_vocab_richness(combined_text)

    # Keep the 30 most frequent bigrams; ties keep first-seen order because
    # sorted() is stable.
    sorted_bigrams = dict(
        sorted(all_bigrams.items(), key=lambda item: item[1], reverse=True)[:30]
    )

    # Average each format feature over the responses that reported it.
    avg_format = {
        k: (sum(values) / len(values) if values else 0.0)
        for k, values in all_format_features.items()
    }

    # Mean of the per-response CJK ratios; 0.0 when nothing was collected.
    avg_cjk = sum(cjk_ratios) / len(cjk_ratios) if cjk_ratios else 0.0

    return LanguageFingerprint(
        vocab_richness=vocab_richness,
        top_bigrams=sorted_bigrams,
        format_features=avg_format,
        opening_patterns=opening_patterns,
        closing_patterns=closing_patterns,
        cjk_ratio=avg_cjk,
    )
|