feat(运维监控): 增强监控功能和健康评分系统
后端改进:
- 新增健康评分计算服务(ops_health_score.go)
- 添加分布式锁支持(ops_advisory_lock.go)
- 优化指标采集和聚合逻辑
- 新增运维指标采集间隔配置(60-3600秒)
- 移除未使用的WebSocket查询token认证中间件
- 改进清理服务和告警评估逻辑

前端改进:
- 简化OpsDashboard组件结构
- 完善国际化文本(中英文)
- 新增运维监控相关API类型定义
- 添加运维指标采集间隔设置界面
- 优化错误详情模态框

测试:
- 添加健康评分单元测试
- 更新API契约测试
This commit is contained in:
@@ -1733,8 +1733,10 @@ export default {
|
||||
redis: 'Redis',
|
||||
goroutines: 'Goroutines',
|
||||
jobs: 'Jobs',
|
||||
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
|
||||
active: 'active',
|
||||
idle: 'idle',
|
||||
waiting: 'waiting',
|
||||
ok: 'ok',
|
||||
lastRun: 'last_run:',
|
||||
lastSuccess: 'last_success:',
|
||||
@@ -1770,12 +1772,50 @@ export default {
|
||||
errorsSla: 'Errors (SLA scope)',
|
||||
upstreamExcl429529: 'Upstream (excl 429/529)',
|
||||
failedToLoadData: 'Failed to load ops data.',
|
||||
failedToLoadOverview: 'Failed to load overview',
|
||||
failedToLoadThroughputTrend: 'Failed to load throughput trend',
|
||||
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
|
||||
failedToLoadErrorTrend: 'Failed to load error trend',
|
||||
failedToLoadErrorDistribution: 'Failed to load error distribution',
|
||||
failedToLoadErrorDetail: 'Failed to load error detail',
|
||||
retryFailed: 'Retry failed',
|
||||
tpsK: 'TPS (K)',
|
||||
top: 'Top:',
|
||||
throughputTrend: 'Throughput Trend',
|
||||
latencyHistogram: 'Latency Histogram',
|
||||
errorTrend: 'Error Trend',
|
||||
errorDistribution: 'Error Distribution',
|
||||
// Health Score & Diagnosis
|
||||
health: 'Health',
|
||||
healthCondition: 'Health Condition',
|
||||
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
|
||||
healthyStatus: 'Healthy',
|
||||
riskyStatus: 'At Risk',
|
||||
idleStatus: 'Idle',
|
||||
diagnosis: {
|
||||
title: 'Smart Diagnosis',
|
||||
footer: 'Automated diagnostic suggestions based on current metrics',
|
||||
idle: 'System is currently idle',
|
||||
idleImpact: 'No active traffic',
|
||||
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
|
||||
upstreamCriticalImpact: 'May affect many user requests',
|
||||
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
|
||||
upstreamHighImpact: 'Recommend checking upstream service status',
|
||||
slaCritical: 'SLA critically below target ({sla}%)',
|
||||
slaCriticalImpact: 'User experience severely degraded',
|
||||
slaLow: 'SLA below target ({sla}%)',
|
||||
slaLowImpact: 'Service quality needs attention',
|
||||
errorHigh: 'Error rate too high ({rate}%)',
|
||||
errorHighImpact: 'Many requests failing',
|
||||
errorElevated: 'Error rate elevated ({rate}%)',
|
||||
errorElevatedImpact: 'Recommend checking error logs',
|
||||
healthCritical: 'Overall health score critically low ({score})',
|
||||
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
|
||||
healthLow: 'Overall health score low ({score})',
|
||||
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
|
||||
healthy: 'All system metrics normal',
|
||||
healthyImpact: 'Service running stable'
|
||||
},
|
||||
// Error Log
|
||||
errorLog: {
|
||||
timeId: 'Time / ID',
|
||||
@@ -2069,7 +2109,21 @@ export default {
|
||||
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
|
||||
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
|
||||
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
|
||||
errorDistribution: 'Error distribution by status code.'
|
||||
errorDistribution: 'Error distribution by status code.',
|
||||
goroutines:
|
||||
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
|
||||
cpu: 'CPU usage percentage, showing system processor load.',
|
||||
memory: 'Memory usage, including used and total available memory.',
|
||||
db: 'Database connection pool status, including active, idle, and waiting connections.',
|
||||
redis: 'Redis connection pool status, showing active and idle connections.',
|
||||
jobs: 'Background job execution status, including last run time, success time, and error information.',
|
||||
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
|
||||
tokens: 'Total number of tokens processed in the current time window.',
|
||||
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
|
||||
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
|
||||
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
|
||||
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
|
||||
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
|
||||
},
|
||||
charts: {
|
||||
emptyRequest: 'No requests in this window.',
|
||||
@@ -2183,7 +2237,9 @@ export default {
|
||||
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
|
||||
queryModeAuto: 'Auto (recommended)',
|
||||
queryModeRaw: 'Raw (most accurate, slower)',
|
||||
queryModePreagg: 'Preagg (fastest, requires aggregation)'
|
||||
queryModePreagg: 'Preagg (fastest, requires aggregation)',
|
||||
metricsInterval: 'Metrics Collection Interval (seconds)',
|
||||
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
|
||||
},
|
||||
adminApiKey: {
|
||||
title: 'Admin API Key',
|
||||
|
||||
Reference in New Issue
Block a user