Files
ai-xn-check/collectors/language.py
nosqli cdcd69256b feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为
- models.py 已加入 IdentityFingerprintModel (第5维数据模型)
- comparator.py 已升级为5维评分 (含identity维度比较)
- reporter.py 已加入身份验证报告输出
- main.py 已集成identity采集流程
- identity collector 待下次提交补充完整代码
2026-03-09 00:15:03 +08:00

118 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Language fingerprint collector — vocabulary, formatting, patterns, CJK ratio."""
from typing import Dict, List
from core.client import AIClient
from core.models import LanguageFingerprint, CollectionConfig
from utils.text_analysis import (
extract_bigrams, calculate_vocab_richness, detect_markdown_features,
extract_opening_pattern, extract_closing_pattern, calculate_cjk_ratio,
)
# Probe prompts sent to the model under test. Each entry targets a different
# aspect of its writing style; the comment above an entry names that aspect.
LANGUAGE_PROMPTS: List[str] = [
    # Plain explanation -> everyday prose style
    "Explain how photosynthesis works in simple terms.",

    # Technical list -> formatting tendencies
    "List 5 best practices for writing clean code and explain each briefly.",

    # Creative paragraph -> vocabulary richness
    "Describe a sunset over the ocean in a vivid, poetic paragraph.",

    # Prompt written in Chinese -> CJK handling
    "请用中文解释什么是机器学习,以及它在日常生活中的应用。",

    # Comparison -> structured-output patterns
    "Compare Python and JavaScript: give 3 similarities and 3 differences.",

    # Pros and cons -> analytical / reasoning language
    "What are the pros and cons of remote work? Give a balanced analysis.",

    # How-to -> step-by-step patterns
    "How do you make a cup of pour-over coffee? Give step-by-step instructions.",

    # Chinese prompt asking for mixed Chinese/English -> code-switching behavior
    "用中英文混合的方式解释什么是API应用程序编程接口可以适当使用英文技术术语。",
]
async def collect_language(client: AIClient, config: CollectionConfig,
                           progress_callback=None) -> LanguageFingerprint:
    """Collect a language fingerprint from an AI API channel.

    Sends every prompt in ``LANGUAGE_PROMPTS`` through ``client``, one at a
    time, and aggregates the text analysis of each non-empty response:
    merged bigram counts (only the 30 most frequent are kept), the
    per-feature mean of the detected markdown features, the opening and
    closing patterns, the mean CJK ratio, and the vocabulary richness of all
    responses joined with newlines.

    A prompt whose request or analysis raises is reported through
    ``progress_callback`` and skipped; it contributes nothing to the result.
    A prompt that yields an empty response is skipped silently and is not
    counted as completed.

    Args:
        client: Object whose awaitable ``send_message(prompt=...,
            max_tokens=...)`` returns a ``(text, latency, headers)`` tuple.
        config: Supplies ``max_tokens`` for every request.
        progress_callback: Optional callable taking a single message string.
            It receives a warning for each failed prompt and a running
            ``completed/total`` line after each successful one.

    Returns:
        A ``LanguageFingerprint`` populated with the aggregated values. The
        CJK ratio is 0.0 and the format-feature mapping is empty when no
        response could be analysed.
    """
    # Imported here so the function is self-contained with respect to the
    # module's import block.
    from collections import Counter, defaultdict

    all_texts: List[str] = []
    bigram_counts: Counter = Counter()
    format_samples: Dict[str, List[float]] = defaultdict(list)
    opening_patterns: List[str] = []
    closing_patterns: List[str] = []
    cjk_ratios: List[float] = []

    total_tasks = len(LANGUAGE_PROMPTS)
    completed = 0

    for prompt_idx, prompt in enumerate(LANGUAGE_PROMPTS):
        try:
            # Only the response text is analysed here; latency and headers
            # are intentionally ignored.
            text, _latency, _headers = await client.send_message(
                prompt=prompt,
                max_tokens=config.max_tokens,
            )
            if not text:
                continue

            # Run every analysis BEFORE touching the accumulators, so that a
            # helper failing part-way cannot leave a half-recorded response
            # (e.g. bigrams merged but no CJK ratio) in the fingerprint.
            bigrams = extract_bigrams(text)
            features = detect_markdown_features(text)
            opening = extract_opening_pattern(text)
            closing = extract_closing_pattern(text)
            cjk_ratio = calculate_cjk_ratio(text)

            # Commit the fully analysed response.
            all_texts.append(text)
            for bigram, count in bigrams.items():
                bigram_counts[bigram] += count
            for name, value in features.items():
                format_samples[name].append(value)
            if opening:
                opening_patterns.append(opening)
            if closing:
                closing_patterns.append(closing)
            cjk_ratios.append(cjk_ratio)
        except Exception as e:
            # Best-effort collection: one bad prompt must not abort the run,
            # so report it and move on to the next prompt.
            if progress_callback:
                progress_callback(f" ⚠ Language prompt {prompt_idx+1} failed: {e}")
            continue

        completed += 1
        if progress_callback:
            progress_callback(f" Language: {completed}/{total_tasks}")

    # Aggregate results.
    vocab_richness = calculate_vocab_richness("\n".join(all_texts))

    # most_common(30) yields the same order as a stable descending sort on
    # the count, so ties keep first-seen order.
    top_bigrams = dict(bigram_counts.most_common(30))

    # Every list in format_samples holds at least one value (a key is only
    # created by an append), so the division is always defined.
    avg_format = {
        name: sum(values) / len(values)
        for name, values in format_samples.items()
    }

    avg_cjk = sum(cjk_ratios) / len(cjk_ratios) if cjk_ratios else 0.0

    return LanguageFingerprint(
        vocab_richness=vocab_richness,
        top_bigrams=top_bigrams,
        format_features=avg_format,
        opening_patterns=opening_patterns,
        closing_patterns=closing_patterns,
        cjk_ratio=avg_cjk,
    )