feat: AI API 指纹检测对比工具 - 初始版本
- 4维指纹采集: 性能/语言/能力/行为 - models.py 已加入 IdentityFingerprintModel (第5维数据模型) - comparator.py 已升级为5维评分 (含identity维度比较) - reporter.py 已加入身份验证报告输出 - main.py 已集成identity采集流程 - identity collector 待下次提交补充完整代码
This commit is contained in:
0
core/__init__.py
Normal file
0
core/__init__.py
Normal file
179
core/client.py
Normal file
179
core/client.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""Async HTTP client for Anthropic-compatible AI API."""
|
||||
|
||||
import json
|
||||
import time
|
||||
import httpx
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class StreamingMetrics:
    """Metrics collected during a single streaming (SSE) response.

    All timing values are relative to the moment the request was started
    (see AIClient.send_message_streaming).
    """
    # Time to first token, in milliseconds (0.0 if no text delta arrived).
    ttft_ms: float = 0.0
    # Arrival time of each text delta, in seconds since request start.
    token_timestamps: list = field(default_factory=list)
    # Number of SSE text deltas received — NOTE(review): counts stream
    # chunks, not model tokens; treat as an approximation.
    total_tokens: int = 0
    # Approximate throughput in deltas ("tokens") per second.
    tps: float = 0.0
||||
|
||||
|
||||
class AIClient:
    """Async client for an Anthropic-compatible AI API.

    Wraps a single httpx.AsyncClient and must be used as an async context
    manager so the connection pool is opened/closed deterministically:

        async with AIClient(base_url, api_key, model) as client:
            text, latency_ms, headers = await client.send_message("hi")
    """

    def __init__(self, base_url: str, api_key: str, model: str,
                 timeout: float = 60, anthropic_version: str = "2023-06-01"):
        """Store connection settings; no network activity happens here.

        Args:
            base_url: API root; any trailing slash is stripped.
            api_key: Secret sent in the "x-api-key" header.
            model: Model identifier placed in each request body.
            timeout: Overall per-request timeout in seconds.
            anthropic_version: Value of the "anthropic-version" header.
        """
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.model = model
        self.timeout = timeout
        self.anthropic_version = anthropic_version
        # Created lazily in __aenter__; None means "not inside the context".
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self):
        """Create the underlying HTTP client (HTTP/2, 10 s connect timeout)."""
        self._client = httpx.AsyncClient(
            timeout=httpx.Timeout(self.timeout, connect=10.0),
            http2=True,
            follow_redirects=True,
        )
        return self

    async def __aexit__(self, *args):
        """Close the HTTP client and drop the reference."""
        if self._client:
            await self._client.aclose()
        self._client = None

    def _get_headers(self) -> dict:
        """Request headers for the Anthropic-style messages endpoint."""
        return {
            "x-api-key": self.api_key,
            "anthropic-version": self.anthropic_version,
            "content-type": "application/json",
        }

    def _get_url(self) -> str:
        """Full URL of the messages endpoint (beta query flag enabled)."""
        return f"{self.base_url}/v1/messages?beta=true"

    def _build_body(self, prompt: str, max_tokens: int = 1024,
                    system: Optional[str] = None, temperature: Optional[float] = None) -> dict:
        """Build the JSON request body for a single-turn user message.

        Note: `system` is included only when truthy, so an empty string is
        treated like None; `temperature` uses an explicit None check so a
        value of 0.0 is still sent.
        """
        body = {
            "model": self.model,
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": prompt}],
        }
        if system:
            body["system"] = system
        if temperature is not None:
            body["temperature"] = temperature
        return body

    async def send_message(self, prompt: str, max_tokens: int = 1024,
                           system: Optional[str] = None, temperature: Optional[float] = None
                           ) -> tuple:
        """
        Send a non-streaming message.

        Returns: (response_text, latency_ms, response_headers)

        Raises:
            RuntimeError: If called outside the 'async with' context.
            httpx.HTTPStatusError: For non-2xx responses (raise_for_status).
        """
        if not self._client:
            raise RuntimeError("Client not initialized. Use 'async with' context.")

        body = self._build_body(prompt, max_tokens, system, temperature)

        # Latency is the full request/response round trip, measured before
        # status checking and JSON decoding.
        start = time.perf_counter()
        response = await self._client.post(
            self._get_url(),
            headers=self._get_headers(),
            json=body,
        )
        latency_ms = (time.perf_counter() - start) * 1000

        response.raise_for_status()
        data = response.json()

        # Extract text from the first content block of the response.
        text = ""
        if "content" in data and len(data["content"]) > 0:
            text = data["content"][0].get("text", "")

        # Collect headers (they are part of the behavioral fingerprint).
        headers = dict(response.headers)

        return text, latency_ms, headers

    async def send_message_streaming(self, prompt: str, max_tokens: int = 1024,
                                     system: Optional[str] = None, temperature: Optional[float] = None
                                     ) -> tuple:
        """
        Send a streaming message using SSE.

        Returns: (full_text, streaming_metrics, response_headers)

        Raises:
            RuntimeError: If called outside the 'async with' context.
            httpx.HTTPStatusError: For non-2xx responses (raise_for_status).
        """
        if not self._client:
            raise RuntimeError("Client not initialized. Use 'async with' context.")

        body = self._build_body(prompt, max_tokens, system, temperature)
        body["stream"] = True

        metrics = StreamingMetrics()
        full_text = ""
        response_headers = {}

        # All timestamps below are relative to this request-start instant.
        start = time.perf_counter()
        first_token_received = False

        async with self._client.stream(
            "POST",
            self._get_url(),
            headers=self._get_headers(),
            json=body,
        ) as response:
            response.raise_for_status()
            response_headers = dict(response.headers)

            # Manual SSE framing: accumulate raw text, split on newlines.
            buffer = ""
            async for chunk in response.aiter_text():
                buffer += chunk

                while "\n" in buffer:
                    line, buffer = buffer.split("\n", 1)
                    line = line.strip()  # also drops the \r of CRLF endings

                    # Skip blank separators and ":" comment/keep-alive lines.
                    if not line or line.startswith(":"):
                        continue

                    if line.startswith("data: "):
                        data_str = line[6:]

                        if data_str.strip() == "[DONE]":
                            continue

                        try:
                            event_data = json.loads(data_str)
                        except (json.JSONDecodeError, ValueError):
                            # Tolerate a malformed event rather than
                            # aborting the whole stream.
                            continue

                        event_type = event_data.get("type", "")

                        if event_type == "content_block_delta":
                            delta = event_data.get("delta", {})
                            text_chunk = delta.get("text", "")

                            if text_chunk:
                                now = time.perf_counter()

                                # First text delta defines time-to-first-token.
                                if not first_token_received:
                                    metrics.ttft_ms = (now - start) * 1000
                                    first_token_received = True

                                metrics.token_timestamps.append(now - start)
                                metrics.total_tokens += 1
                                full_text += text_chunk

        # Throughput: prefer spacing between first and last delta (excludes
        # TTFT); fall back to tokens/elapsed when there is only one delta or
        # the deltas arrived within the timer's resolution.
        elapsed = time.perf_counter() - start
        if metrics.total_tokens > 0 and elapsed > 0:
            if len(metrics.token_timestamps) > 1:
                generation_time = metrics.token_timestamps[-1] - metrics.token_timestamps[0]
                if generation_time > 0:
                    metrics.tps = (metrics.total_tokens - 1) / generation_time
                else:
                    metrics.tps = metrics.total_tokens / elapsed
            else:
                metrics.tps = metrics.total_tokens / elapsed

        return full_text, metrics, response_headers
|
||||
52
core/config.py
Normal file
52
core/config.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""YAML configuration loader and validator."""
|
||||
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from .models import ChannelConfig, CollectionConfig
|
||||
|
||||
|
||||
def load_config(config_path: str) -> dict:
    """Load and validate configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Dict with keys 'genuine' and 'suspect' (ChannelConfig),
        'collection' (CollectionConfig) and 'output' (raw dict).

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not a YAML mapping, or a channel is
            missing a required field (raised by _parse_channel).
    """
    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(path, 'r', encoding='utf-8') as f:
        raw = yaml.safe_load(f)

    # yaml.safe_load returns None for an empty file; normalize so the
    # .get() calls below don't blow up with AttributeError. Reject
    # non-mapping documents (e.g. a top-level list) with a clear error.
    if raw is None:
        raw = {}
    if not isinstance(raw, dict):
        raise ValueError(f"Config file must contain a YAML mapping: {config_path}")

    # Parse channel configs. "key: " with no value parses to None, so use
    # "or {}" to hand _parse_channel a dict and get its clear ValueError.
    genuine = _parse_channel(raw.get('genuine') or {}, 'genuine')
    suspect = _parse_channel(raw.get('suspect') or {}, 'suspect')

    # Parse collection config (all fields optional, with defaults).
    coll = raw.get('collection') or {}
    collection = CollectionConfig(
        repeat_count=coll.get('repeat_count', 3),
        timeout=coll.get('timeout', 60),
        max_tokens=coll.get('max_tokens', 1024),
        anthropic_version=coll.get('anthropic_version', '2023-06-01'),
    )

    # Output config is passed through as a raw dict.
    output = raw.get('output') or {}

    return {
        'genuine': genuine,
        'suspect': suspect,
        'collection': collection,
        'output': output,
    }
|
||||
|
||||
|
||||
def _parse_channel(data: dict, name: str) -> ChannelConfig:
|
||||
"""Parse and validate a channel configuration."""
|
||||
required = ['base_url', 'api_key', 'model']
|
||||
for key in required:
|
||||
if key not in data or not data[key]:
|
||||
raise ValueError(f"Channel '{name}' missing required field: {key}")
|
||||
|
||||
return ChannelConfig(
|
||||
base_url=data['base_url'].rstrip('/'),
|
||||
api_key=data['api_key'],
|
||||
model=data['model'],
|
||||
)
|
||||
278
core/models.py
Normal file
278
core/models.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""Data models for AI API fingerprint detection."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
class ChannelConfig:
    """Configuration for a single API channel (one endpoint under test)."""
    # API root URL; stored without a trailing slash (stripped by the loader).
    base_url: str
    # Secret key for this channel — sent as the "x-api-key" header.
    api_key: str
    # Model identifier requested from this channel.
    model: str
|
||||
|
||||
|
||||
@dataclass
class CollectionConfig:
    """Configuration for data collection."""
    # How many times each probe is repeated per channel.
    repeat_count: int = 3
    # Per-request timeout in seconds (presumably forwarded to the HTTP
    # client's timeout — confirm against the collector wiring).
    timeout: float = 60
    # max_tokens value sent in each request body.
    max_tokens: int = 1024
    # Value of the "anthropic-version" request header.
    anthropic_version: str = "2023-06-01"
|
||||
|
||||
|
||||
@dataclass
class PerformanceFingerprint:
    """Performance metrics fingerprint.

    Holds both the raw samples (per-request latencies, response lengths)
    and their aggregated statistics.
    """
    latencies_ms: List[float] = field(default_factory=list)
    p50_latency_ms: float = 0.0
    p95_latency_ms: float = 0.0
    p99_latency_ms: float = 0.0
    avg_ttft_ms: float = 0.0
    avg_tps: float = 0.0
    response_lengths: List[int] = field(default_factory=list)
    avg_response_length: float = 0.0

    def to_dict(self) -> dict:
        """Serialize to a plain dict; container values are shared, not copied."""
        return dict(
            latencies_ms=self.latencies_ms,
            p50_latency_ms=self.p50_latency_ms,
            p95_latency_ms=self.p95_latency_ms,
            p99_latency_ms=self.p99_latency_ms,
            avg_ttft_ms=self.avg_ttft_ms,
            avg_tps=self.avg_tps,
            response_lengths=self.response_lengths,
            avg_response_length=self.avg_response_length,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "PerformanceFingerprint":
        """Deserialize from a dict; absent keys fall back to field defaults."""
        defaults = {
            "latencies_ms": [],
            "p50_latency_ms": 0.0,
            "p95_latency_ms": 0.0,
            "p99_latency_ms": 0.0,
            "avg_ttft_ms": 0.0,
            "avg_tps": 0.0,
            "response_lengths": [],
            "avg_response_length": 0.0,
        }
        return cls(**{key: data.get(key, fallback) for key, fallback in defaults.items()})
|
||||
|
||||
|
||||
@dataclass
class LanguageFingerprint:
    """Language pattern fingerprint.

    Captures lexical and formatting features of a channel's responses.
    """
    vocab_richness: float = 0.0
    top_bigrams: Dict[str, int] = field(default_factory=dict)
    format_features: Dict[str, float] = field(default_factory=dict)
    opening_patterns: List[str] = field(default_factory=list)
    closing_patterns: List[str] = field(default_factory=list)
    cjk_ratio: float = 0.0

    def to_dict(self) -> dict:
        """Serialize to a plain dict; container values are shared, not copied."""
        return dict(
            vocab_richness=self.vocab_richness,
            top_bigrams=self.top_bigrams,
            format_features=self.format_features,
            opening_patterns=self.opening_patterns,
            closing_patterns=self.closing_patterns,
            cjk_ratio=self.cjk_ratio,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "LanguageFingerprint":
        """Deserialize from a dict; absent keys fall back to field defaults."""
        defaults = {
            "vocab_richness": 0.0,
            "top_bigrams": {},
            "format_features": {},
            "opening_patterns": [],
            "closing_patterns": [],
            "cjk_ratio": 0.0,
        }
        return cls(**{key: data.get(key, fallback) for key, fallback in defaults.items()})
|
||||
|
||||
|
||||
@dataclass
class CapabilityFingerprint:
    """Capability test fingerprint.

    Per-probe results keyed by probe id: raw knowledge-cutoff answers plus
    pass/fail flags for math, code and refusal probes.
    """
    knowledge_cutoff_responses: Dict[str, str] = field(default_factory=dict)
    math_scores: Dict[str, bool] = field(default_factory=dict)
    code_scores: Dict[str, bool] = field(default_factory=dict)
    refusal_patterns: Dict[str, bool] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; container values are shared, not copied."""
        return dict(
            knowledge_cutoff_responses=self.knowledge_cutoff_responses,
            math_scores=self.math_scores,
            code_scores=self.code_scores,
            refusal_patterns=self.refusal_patterns,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "CapabilityFingerprint":
        """Deserialize from a dict; absent keys default to empty dicts."""
        keys = ("knowledge_cutoff_responses", "math_scores", "code_scores", "refusal_patterns")
        return cls(**{key: data.get(key, {}) for key in keys})
|
||||
|
||||
|
||||
@dataclass
class BehavioralFingerprint:
    """Behavioral pattern fingerprint.

    Records cross-run consistency, instruction-following results and the
    HTTP response headers observed from the channel.
    """
    consistency_scores: List[float] = field(default_factory=list)
    instruction_compliance: Dict[str, bool] = field(default_factory=dict)
    response_headers: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; container values are shared, not copied."""
        return dict(
            consistency_scores=self.consistency_scores,
            instruction_compliance=self.instruction_compliance,
            response_headers=self.response_headers,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "BehavioralFingerprint":
        """Deserialize from a dict; absent keys fall back to field defaults."""
        return cls(
            consistency_scores=data.get("consistency_scores") if "consistency_scores" in data else [],
            instruction_compliance=data.get("instruction_compliance") if "instruction_compliance" in data else {},
            response_headers=data.get("response_headers") if "response_headers" in data else {},
        )
|
||||
|
||||
|
||||
@dataclass
class IdentityFingerprintModel:
    """Identity verification fingerprint — stored in FullFingerprint.

    This is a lightweight model for serialization; the full IdentityFingerprint
    lives in collectors/identity.py and is converted to/from this for storage.
    """
    claimed_identity: str = ""
    claimed_developer: str = ""
    identity_consistency: float = 0.0
    detected_model: str = ""
    detection_confidence: float = 0.0
    model_scores: Dict[str, float] = field(default_factory=dict)
    vocab_markers: Dict[str, int] = field(default_factory=dict)
    marker_details: Dict[str, List[str]] = field(default_factory=dict)
    signature_behaviors: Dict[str, str] = field(default_factory=dict)
    system_prompt_leaked: bool = False
    system_prompt_hints: List[str] = field(default_factory=list)
    knowledge_results: Dict[str, bool] = field(default_factory=dict)
    identity_responses: Dict[str, str] = field(default_factory=dict)
    is_claimed_model: bool = True
    identity_mismatch_reasons: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; container values are shared, not copied."""
        return dict(
            claimed_identity=self.claimed_identity,
            claimed_developer=self.claimed_developer,
            identity_consistency=self.identity_consistency,
            detected_model=self.detected_model,
            detection_confidence=self.detection_confidence,
            model_scores=self.model_scores,
            vocab_markers=self.vocab_markers,
            marker_details=self.marker_details,
            signature_behaviors=self.signature_behaviors,
            system_prompt_leaked=self.system_prompt_leaked,
            system_prompt_hints=self.system_prompt_hints,
            knowledge_results=self.knowledge_results,
            identity_responses=self.identity_responses,
            is_claimed_model=self.is_claimed_model,
            identity_mismatch_reasons=self.identity_mismatch_reasons,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "IdentityFingerprintModel":
        """Deserialize from a dict; absent keys fall back to field defaults."""
        defaults = {
            "claimed_identity": "",
            "claimed_developer": "",
            "identity_consistency": 0.0,
            "detected_model": "",
            "detection_confidence": 0.0,
            "model_scores": {},
            "vocab_markers": {},
            "marker_details": {},
            "signature_behaviors": {},
            "system_prompt_leaked": False,
            "system_prompt_hints": [],
            "knowledge_results": {},
            "identity_responses": {},
            "is_claimed_model": True,
            "identity_mismatch_reasons": [],
        }
        return cls(**{key: data.get(key, fallback) for key, fallback in defaults.items()})
|
||||
|
||||
|
||||
@dataclass
class FullFingerprint:
    """Complete fingerprint combining all dimensions for one channel."""
    channel_name: str = ""
    timestamp: str = ""
    performance: PerformanceFingerprint = field(default_factory=PerformanceFingerprint)
    language: LanguageFingerprint = field(default_factory=LanguageFingerprint)
    capability: CapabilityFingerprint = field(default_factory=CapabilityFingerprint)
    behavioral: BehavioralFingerprint = field(default_factory=BehavioralFingerprint)
    identity: IdentityFingerprintModel = field(default_factory=IdentityFingerprintModel)
    raw_responses: Dict[str, list] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize, delegating each dimension to its own to_dict."""
        result = {
            "channel_name": self.channel_name,
            "timestamp": self.timestamp,
        }
        for dim in ("performance", "language", "capability", "behavioral", "identity"):
            result[dim] = getattr(self, dim).to_dict()
        result["raw_responses"] = self.raw_responses
        return result

    @classmethod
    def from_dict(cls, data: dict) -> "FullFingerprint":
        """Deserialize, rebuilding each dimension via its own from_dict."""
        dimension_types = {
            "performance": PerformanceFingerprint,
            "language": LanguageFingerprint,
            "capability": CapabilityFingerprint,
            "behavioral": BehavioralFingerprint,
            "identity": IdentityFingerprintModel,
        }
        kwargs = {
            "channel_name": data.get("channel_name", ""),
            "timestamp": data.get("timestamp", ""),
            "raw_responses": data.get("raw_responses", {}),
        }
        for attr, model in dimension_types.items():
            kwargs[attr] = model.from_dict(data.get(attr, {}))
        return cls(**kwargs)
|
||||
|
||||
|
||||
@dataclass
class DimensionScore:
    """Score for a single comparison dimension."""
    dimension: str = ""   # dimension name (e.g. "performance", "identity")
    score: float = 0.0    # score computed for this dimension
    weight: float = 0.0   # weight of this dimension in the overall score
    details: Dict = field(default_factory=dict)  # free-form supporting data

    def to_dict(self) -> dict:
        """Serialize to a plain dict; `details` is shared, not copied."""
        return dict(
            dimension=self.dimension,
            score=self.score,
            weight=self.weight,
            details=self.details,
        )
|
||||
|
||||
|
||||
@dataclass
class ComparisonResult:
    """Final comparison result across all dimensions."""
    genuine_channel: str = ""
    suspect_channel: str = ""
    dimension_scores: List[DimensionScore] = field(default_factory=list)
    overall_score: float = 0.0
    verdict: str = ""
    timestamp: str = ""

    def to_dict(self) -> dict:
        """Serialize; each DimensionScore is flattened via its own to_dict."""
        flattened = []
        for entry in self.dimension_scores:
            flattened.append(entry.to_dict())
        return dict(
            genuine_channel=self.genuine_channel,
            suspect_channel=self.suspect_channel,
            dimension_scores=flattened,
            overall_score=self.overall_score,
            verdict=self.verdict,
            timestamp=self.timestamp,
        )
|
||||
Reference in New Issue
Block a user