diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go index 68cfc10d..feb0d843 100644 --- a/backend/internal/service/ops_health_score.go +++ b/backend/internal/service/ops_health_score.go @@ -9,7 +9,8 @@ import ( // // Design goals: // - Backend-owned scoring (UI only displays). -// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs). +// - Layered scoring: Business Health (70%) + Infrastructure Health (30%) +// - Avoids double-counting (e.g., DB failure affects both infra and business metrics) // - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data. func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int { if overview == nil { @@ -22,97 +23,124 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) return 100 } - score := 100.0 + businessHealth := computeBusinessHealth(overview) + infraHealth := computeInfraHealth(now, overview) - // --- SLA (primary signal) --- - // SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later. + // Weighted combination: 70% business + 30% infrastructure + score := businessHealth*0.7 + infraHealth*0.3 + return int(math.Round(clampFloat64(score, 0, 100))) +} + +// computeBusinessHealth calculates business health score (0-100) +// Components: SLA (50%) + Error Rate (30%) + Latency (20%) +func computeBusinessHealth(overview *OpsDashboardOverview) float64 { + // SLA score: 99.5% → 100, 95% → 0 (linear) + slaScore := 100.0 slaPct := clampFloat64(overview.SLA*100, 0, 100) if slaPct < 99.5 { - // Up to -45 points as SLA drops. - score -= math.Min(45, (99.5-slaPct)*12) + if slaPct >= 95 { + slaScore = (slaPct - 95) / 4.5 * 100 + } else { + slaScore = 0 + } } - // --- Error rates (secondary signal) --- + // Error rate score: 0.5% → 100, 5% → 0 (linear) + // Combines request errors and upstream errors + errorScore := 100.0 errorPct := clampFloat64(overview.ErrorRate*100, 0, 100) - if errorPct > 1 { - // Cap at -20 points by 6% error rate. - score -= math.Min(20, (errorPct-1)*4) - } - upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100) - if upstreamPct > 1 { - // Upstream instability deserves extra weight, but keep it smaller than SLA/error. - score -= math.Min(15, (upstreamPct-1)*3) + combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case + if combinedErrorPct > 0.5 { + if combinedErrorPct <= 5 { + errorScore = (5 - combinedErrorPct) / 4.5 * 100 + } else { + errorScore = 0 + } } - // --- Latency (tail-focused) --- - // Use p99 of duration + TTFT. Penalize only when clearly elevated. + // Latency score: 1s → 100, 10s → 0 (linear) + // Uses P99 of duration (TTFT is less critical for overall health) + latencyScore := 100.0 if overview.Duration.P99 != nil { p99 := float64(*overview.Duration.P99) - if p99 > 2000 { - // From 2s upward, gradually penalize up to -20. - score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20 - } - } - if overview.TTFT.P99 != nil { - p99 := float64(*overview.TTFT.P99) - if p99 > 500 { - // TTFT > 500ms starts hurting; cap at -10. - score -= math.Min(10, (p99-500)/200) // 2.5s => -10 + if p99 > 1000 { + if p99 <= 10000 { + latencyScore = (10000 - p99) / 9000 * 100 + } else { + latencyScore = 0 + } } } - // --- System metrics snapshot (best-effort) --- + // Weighted combination + return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2 +} + +// computeInfraHealth calculates infrastructure health score (0-100) +// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%) +func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 { + // Storage score: DB critical, Redis less critical + storageScore := 100.0 if overview.SystemMetrics != nil { if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK { - score -= 20 - } - if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK { - score -= 15 - } - - if overview.SystemMetrics.CPUUsagePercent != nil { - cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100) - if cpuPct > 85 { - score -= math.Min(10, (cpuPct-85)*1.5) - } - } - if overview.SystemMetrics.MemoryUsagePercent != nil { - memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100) - if memPct > 90 { - score -= math.Min(10, (memPct-90)*1.0) - } - } - - if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 { - waiting := float64(*overview.SystemMetrics.DBConnWaiting) - score -= math.Min(10, waiting*2) - } - if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 { - depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth) - score -= math.Min(10, depth*0.5) + storageScore = 0 // DB failure is critical + } else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK { + storageScore = 50 // Redis failure is degraded but not critical } } - // --- Job heartbeats (best-effort) --- - // Penalize only clear "error after last success" signals, and cap the impact. - jobPenalty := 0.0 + // Compute resources score: CPU + Memory + computeScore := 100.0 + if overview.SystemMetrics != nil { + cpuScore := 100.0 + if overview.SystemMetrics.CPUUsagePercent != nil { + cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100) + if cpuPct > 80 { + if cpuPct <= 100 { + cpuScore = (100 - cpuPct) / 20 * 100 + } else { + cpuScore = 0 + } + } + } + + memScore := 100.0 + if overview.SystemMetrics.MemoryUsagePercent != nil { + memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100) + if memPct > 85 { + if memPct <= 100 { + memScore = (100 - memPct) / 15 * 100 + } else { + memScore = 0 + } + } + } + + computeScore = (cpuScore + memScore) / 2 + } + + // Background jobs score + jobScore := 100.0 + failedJobs := 0 + totalJobs := 0 for _, hb := range overview.JobHeartbeats { if hb == nil { continue } + totalJobs++ if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) { - jobPenalty += 5 - continue - } - if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute { - jobPenalty += 2 + failedJobs++ + } else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute { + failedJobs++ } } - score -= math.Min(15, jobPenalty) + if totalJobs > 0 && failedJobs > 0 { + jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100 + } - score = clampFloat64(score, 0, 100) - return int(math.Round(score)) + // Weighted combination + return storageScore*0.4 + computeScore*0.3 + jobScore*0.3 } func clampFloat64(v float64, min float64, max float64) float64 { diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go index d7e5dd8c..849ba146 100644 --- a/backend/internal/service/ops_health_score_test.go +++ b/backend/internal/service/ops_health_score_test.go @@ -55,6 +55,377 @@ func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) { require.GreaterOrEqual(t, score, 0) } +func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin int + wantMax int + }{ + { + name: "nil overview returns 0", + overview: nil, + wantMin: 0, + wantMax: 0, + }, + { + name: "perfect health", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 1.0, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + TTFT: OpsPercentiles{P99: intPtr(100)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "good health - SLA 99.8%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.998, + ErrorRate: 0.003, + UpstreamErrorRate: 0.001, + Duration: OpsPercentiles{P99: intPtr(800)}, + TTFT: OpsPercentiles{P99: intPtr(200)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(50), + MemoryUsagePercent: float64Ptr(60), + }, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "medium health - SLA 96%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.96, + ErrorRate: 0.02, + UpstreamErrorRate: 0.01, + Duration: OpsPercentiles{P99: intPtr(3000)}, + TTFT: OpsPercentiles{P99: intPtr(600)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(70), + MemoryUsagePercent: float64Ptr(75), + }, + }, + wantMin: 60, + wantMax: 85, + }, + { + name: "DB failure", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 70, + wantMax: 90, + }, + { + name: "Redis failure", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 95, + }, + { + name: "high CPU usage", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(95), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 100, + }, + { + name: "combined failures - business degraded + infra healthy", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.90, + ErrorRate: 0.05, + UpstreamErrorRate: 0.02, + Duration: OpsPercentiles{P99: intPtr(10000)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(20), + MemoryUsagePercent: float64Ptr(30), + }, + }, + wantMin: 25, + wantMax: 50, + }, + { + name: "combined failures - business healthy + infra degraded", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.998, + ErrorRate: 0.001, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(600)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(95), + MemoryUsagePercent: float64Ptr(95), + }, + }, + wantMin: 70, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeDashboardHealthScore(time.Now().UTC(), tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax) + require.GreaterOrEqual(t, score, 0, "score must be >= 0") + require.LessOrEqual(t, score, 100, "score must be <= 100") + }) + } +} + +func TestComputeBusinessHealth(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin float64 + wantMax float64 + }{ + { + name: "perfect metrics", + overview: &OpsDashboardOverview{ + SLA: 1.0, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "SLA boundary 99.5%", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "SLA boundary 95%", + overview: &OpsDashboardOverview{ + SLA: 0.95, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 50, + wantMax: 60, + }, + { + name: "error rate boundary 0.5%", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0.005, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "latency boundary 1000ms", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(1000)}, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "upstream error dominates", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0.001, + UpstreamErrorRate: 0.03, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 75, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeBusinessHealth(tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax) + require.GreaterOrEqual(t, score, 0.0, "score must be >= 0") + require.LessOrEqual(t, score, 100.0, "score must be <= 100") + }) + } +} + +func TestComputeInfraHealth(t *testing.T) { + t.Parallel() + + now := time.Now().UTC() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin float64 + wantMax float64 + }{ + { + name: "all infrastructure healthy", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "DB down", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 50, + wantMax: 70, + }, + { + name: "Redis down", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 80, + wantMax: 95, + }, + { + name: "CPU at 90%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(90), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 95, + }, + { + name: "failed background job", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + JobHeartbeats: []*OpsJobHeartbeat{ + { + JobName: "test-job", + LastErrorAt: &now, + }, + }, + }, + wantMin: 70, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeInfraHealth(now, tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax) + require.GreaterOrEqual(t, score, 0.0, "score must be >= 0") + require.LessOrEqual(t, score, 100.0, "score must be <= 100") + }) + } +} + func timePtr(v time.Time) *time.Time { return &v } func stringPtr(v string) *string { return &v } diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 52faf577..6458bcd4 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -1928,22 +1928,62 @@ export default { footer: 'Automated diagnostic suggestions based on current metrics', idle: 'System is currently idle', idleImpact: 'No active traffic', + // Resource diagnostics + dbDown: 'Database connection failed', + dbDownImpact: 'All database operations will fail', + dbDownAction: 'Check database service status, network connectivity, and connection configuration', + redisDown: 'Redis connection failed', + redisDownImpact: 'Cache functionality degraded, performance may decline', + redisDownAction: 'Check Redis service status and network connectivity', + cpuCritical: 'CPU usage critically high ({usage}%)', + cpuCriticalImpact: 'System response slowing, may affect all requests', + cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization', + cpuHigh: 'CPU usage elevated ({usage}%)', + cpuHighImpact: 'System load is high, needs attention', + cpuHighAction: 'Monitor CPU trends, prepare scaling plan', + memoryCritical: 'Memory usage critically high ({usage}%)', + memoryCriticalImpact: 'May trigger OOM, system stability threatened', + memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage', + memoryHigh: 'Memory usage elevated ({usage}%)', + memoryHighImpact: 'Memory pressure is high, needs attention', + memoryHighAction: 'Monitor memory trends, check for memory leaks', + // Latency diagnostics + latencyCritical: 'Response latency critically high ({latency}ms)', + latencyCriticalImpact: 'User experience extremely poor, many requests timing out', + latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services', + latencyHigh: 'Response latency elevated ({latency}ms)', + latencyHighImpact: 'User experience degraded, needs optimization', + latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic', + ttftHigh: 'Time to first byte elevated ({ttft}ms)', + ttftHighImpact: 'User perceived latency increased', + ttftHighAction: 'Optimize request processing flow, reduce pre-processing time', + // Error rate diagnostics upstreamCritical: 'Upstream error rate critically high ({rate}%)', upstreamCriticalImpact: 'May affect many user requests', + upstreamCriticalAction: 'Check upstream service health, enable fallback strategies', upstreamHigh: 'Upstream error rate elevated ({rate}%)', upstreamHighImpact: 'Recommend checking upstream service status', - slaCritical: 'SLA critically below target ({sla}%)', - slaCriticalImpact: 'User experience severely degraded', - slaLow: 'SLA below target ({sla}%)', - slaLowImpact: 'Service quality needs attention', + upstreamHighAction: 'Contact upstream service team, prepare fallback plan', errorHigh: 'Error rate too high ({rate}%)', errorHighImpact: 'Many requests failing', + errorHighAction: 'Check error logs, identify root cause, urgent fix required', errorElevated: 'Error rate elevated ({rate}%)', errorElevatedImpact: 'Recommend checking error logs', + errorElevatedAction: 'Analyze error types and distribution, create fix plan', + // SLA diagnostics + slaCritical: 'SLA critically below target ({sla}%)', + slaCriticalImpact: 'User experience severely degraded', + slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting', + slaLow: 'SLA below target ({sla}%)', + slaLowImpact: 'Service quality needs attention', + slaLowAction: 'Analyze SLA decline causes, optimize system performance', + // Health score diagnostics healthCritical: 'Overall health score critically low ({score})', healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation', + healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues', healthLow: 'Overall health score low ({score})', healthLowImpact: 'May indicate minor instability; monitor SLA and error rates', + healthLowAction: 'Monitor metric trends, prevent issue escalation', healthy: 'All system metrics normal', healthyImpact: 'Service running stable' }, diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index 32f516e7..da1aa0b7 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -2074,22 +2074,62 @@ export default { footer: '基于当前指标的自动诊断建议', idle: '系统当前处于待机状态', idleImpact: '无活跃流量', + // Resource diagnostics + dbDown: '数据库连接失败', + dbDownImpact: '所有数据库操作将失败', + dbDownAction: '检查数据库服务状态、网络连接和连接配置', + redisDown: 'Redis连接失败', + redisDownImpact: '缓存功能降级,性能可能下降', + redisDownAction: '检查Redis服务状态和网络连接', + cpuCritical: 'CPU使用率严重过高 ({usage}%)', + cpuCriticalImpact: '系统响应变慢,可能影响所有请求', + cpuCriticalAction: '检查CPU密集型任务,考虑扩容或优化代码', + cpuHigh: 'CPU使用率偏高 ({usage}%)', + cpuHighImpact: '系统负载较高,需要关注', + cpuHighAction: '监控CPU趋势,准备扩容方案', + memoryCritical: '内存使用率严重过高 ({usage}%)', + memoryCriticalImpact: '可能触发OOM,系统稳定性受威胁', + memoryCriticalAction: '检查内存泄漏,考虑增加内存或优化内存使用', + memoryHigh: '内存使用率偏高 ({usage}%)', + memoryHighImpact: '内存压力较大,需要关注', + memoryHighAction: '监控内存趋势,检查是否有内存泄漏', + // Latency diagnostics + latencyCritical: '响应延迟严重过高 ({latency}ms)', + latencyCriticalImpact: '用户体验极差,大量请求超时', + latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务', + latencyHigh: '响应延迟偏高 ({latency}ms)', + latencyHighImpact: '用户体验下降,需要优化', + latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑', + ttftHigh: '首字节时间偏高 ({ttft}ms)', + ttftHighImpact: '用户感知延迟增加', + ttftHighAction: '优化请求处理流程,减少前置逻辑耗时', + // Error rate diagnostics upstreamCritical: '上游错误率严重偏高 ({rate}%)', upstreamCriticalImpact: '可能影响大量用户请求', + upstreamCriticalAction: '检查上游服务健康状态,启用降级策略', upstreamHigh: '上游错误率偏高 ({rate}%)', upstreamHighImpact: '建议检查上游服务状态', - slaCritical: 'SLA 严重低于目标 ({sla}%)', - slaCriticalImpact: '用户体验严重受损', - slaLow: 'SLA 低于目标 ({sla}%)', - slaLowImpact: '需要关注服务质量', + upstreamHighAction: '联系上游服务团队,准备降级方案', errorHigh: '错误率过高 ({rate}%)', errorHighImpact: '大量请求失败', + errorHighAction: '查看错误日志,定位错误根因,紧急修复', errorElevated: '错误率偏高 ({rate}%)', errorElevatedImpact: '建议检查错误日志', + errorElevatedAction: '分析错误类型和分布,制定修复计划', + // SLA diagnostics + slaCritical: 'SLA 严重低于目标 ({sla}%)', + slaCriticalImpact: '用户体验严重受损', + slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护', + slaLow: 'SLA 低于目标 ({sla}%)', + slaLowImpact: '需要关注服务质量', + slaLowAction: '分析SLA下降原因,优化系统性能', + // Health score diagnostics healthCritical: '综合健康评分过低 ({score})', healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟', + healthCriticalAction: '全面检查系统状态,优先处理critical级别问题', healthLow: '综合健康评分偏低 ({score})', healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率', + healthLowAction: '监控指标趋势,预防问题恶化', healthy: '所有系统指标正常', healthyImpact: '服务运行稳定' }, diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index afc17813..ccb5dac7 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -287,6 +287,7 @@ interface DiagnosisItem { type: 'critical' | 'warning' | 'info' message: string impact: string + action?: string } const diagnosisReport = computed(() => { @@ -304,63 +305,157 @@ const diagnosisReport = computed(() => { return report } - const upstreamRatePct = (ov.upstream_error_rate ?? 0) * 100 - if (upstreamRatePct > 10) { + // Resource diagnostics (highest priority) + const sm = ov.system_metrics + if (sm) { + if (sm.db_ok === false) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.dbDown'), + impact: t('admin.ops.diagnosis.dbDownImpact'), + action: t('admin.ops.diagnosis.dbDownAction') + }) + } + if (sm.redis_ok === false) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.redisDown'), + impact: t('admin.ops.diagnosis.redisDownImpact'), + action: t('admin.ops.diagnosis.redisDownAction') + }) + } + + const cpuPct = sm.cpu_usage_percent ?? 0 + if (cpuPct > 90) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.cpuCritical', { usage: cpuPct.toFixed(1) }), + impact: t('admin.ops.diagnosis.cpuCriticalImpact'), + action: t('admin.ops.diagnosis.cpuCriticalAction') + }) + } else if (cpuPct > 80) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.cpuHigh', { usage: cpuPct.toFixed(1) }), + impact: t('admin.ops.diagnosis.cpuHighImpact'), + action: t('admin.ops.diagnosis.cpuHighAction') + }) + } + + const memPct = sm.memory_usage_percent ?? 0 + if (memPct > 90) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.memoryCritical', { usage: memPct.toFixed(1) }), + impact: t('admin.ops.diagnosis.memoryCriticalImpact'), + action: t('admin.ops.diagnosis.memoryCriticalAction') + }) + } else if (memPct > 85) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.memoryHigh', { usage: memPct.toFixed(1) }), + impact: t('admin.ops.diagnosis.memoryHighImpact'), + action: t('admin.ops.diagnosis.memoryHighAction') + }) + } + } + + // Latency diagnostics + const durationP99 = ov.duration?.p99_ms ?? 0 + if (durationP99 > 2000) { report.push({ type: 'critical', - message: t('admin.ops.diagnosis.upstreamCritical', { rate: upstreamRatePct.toFixed(2) }), - impact: t('admin.ops.diagnosis.upstreamCriticalImpact') + message: t('admin.ops.diagnosis.latencyCritical', { latency: durationP99.toFixed(0) }), + impact: t('admin.ops.diagnosis.latencyCriticalImpact'), + action: t('admin.ops.diagnosis.latencyCriticalAction') }) - } else if (upstreamRatePct > 3) { + } else if (durationP99 > 1000) { report.push({ type: 'warning', - message: t('admin.ops.diagnosis.upstreamHigh', { rate: upstreamRatePct.toFixed(2) }), - impact: t('admin.ops.diagnosis.upstreamHighImpact') + message: t('admin.ops.diagnosis.latencyHigh', { latency: durationP99.toFixed(0) }), + impact: t('admin.ops.diagnosis.latencyHighImpact'), + action: t('admin.ops.diagnosis.latencyHighAction') }) } + const ttftP99 = ov.ttft?.p99_ms ?? 0 + if (ttftP99 > 500) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.ttftHigh', { ttft: ttftP99.toFixed(0) }), + impact: t('admin.ops.diagnosis.ttftHighImpact'), + action: t('admin.ops.diagnosis.ttftHighAction') + }) + } + + // Error rate diagnostics (adjusted thresholds) + const upstreamRatePct = (ov.upstream_error_rate ?? 0) * 100 + if (upstreamRatePct > 5) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.upstreamCritical', { rate: upstreamRatePct.toFixed(2) }), + impact: t('admin.ops.diagnosis.upstreamCriticalImpact'), + action: t('admin.ops.diagnosis.upstreamCriticalAction') + }) + } else if (upstreamRatePct > 2) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.upstreamHigh', { rate: upstreamRatePct.toFixed(2) }), + impact: t('admin.ops.diagnosis.upstreamHighImpact'), + action: t('admin.ops.diagnosis.upstreamHighAction') + }) + } + + const errorPct = (ov.error_rate ?? 0) * 100 + if (errorPct > 3) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.errorHigh', { rate: errorPct.toFixed(2) }), + impact: t('admin.ops.diagnosis.errorHighImpact'), + action: t('admin.ops.diagnosis.errorHighAction') + }) + } else if (errorPct > 0.5) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.errorElevated', { rate: errorPct.toFixed(2) }), + impact: t('admin.ops.diagnosis.errorElevatedImpact'), + action: t('admin.ops.diagnosis.errorElevatedAction') + }) + } + + // SLA diagnostics const slaPct = (ov.sla ?? 0) * 100 if (slaPct < 90) { report.push({ type: 'critical', message: t('admin.ops.diagnosis.slaCritical', { sla: slaPct.toFixed(2) }), - impact: t('admin.ops.diagnosis.slaCriticalImpact') + impact: t('admin.ops.diagnosis.slaCriticalImpact'), + action: t('admin.ops.diagnosis.slaCriticalAction') }) } else if (slaPct < 98) { report.push({ type: 'warning', message: t('admin.ops.diagnosis.slaLow', { sla: slaPct.toFixed(2) }), - impact: t('admin.ops.diagnosis.slaLowImpact') - }) - } - - const errorPct = (ov.error_rate ?? 0) * 100 - if (errorPct > 5) { - report.push({ - type: 'critical', - message: t('admin.ops.diagnosis.errorHigh', { rate: errorPct.toFixed(2) }), - impact: t('admin.ops.diagnosis.errorHighImpact') - }) - } else if (errorPct > 1) { - report.push({ - type: 'warning', - message: t('admin.ops.diagnosis.errorElevated', { rate: errorPct.toFixed(2) }), - impact: t('admin.ops.diagnosis.errorElevatedImpact') + impact: t('admin.ops.diagnosis.slaLowImpact'), + action: t('admin.ops.diagnosis.slaLowAction') }) } + // Health score diagnostics (lowest priority) if (healthScoreValue.value != null) { if (healthScoreValue.value < 60) { report.push({ type: 'critical', message: t('admin.ops.diagnosis.healthCritical', { score: healthScoreValue.value }), - impact: t('admin.ops.diagnosis.healthCriticalImpact') + impact: t('admin.ops.diagnosis.healthCriticalImpact'), + action: t('admin.ops.diagnosis.healthCriticalAction') }) } else if (healthScoreValue.value < 90) { report.push({ type: 'warning', message: t('admin.ops.diagnosis.healthLow', { score: healthScoreValue.value }), - impact: t('admin.ops.diagnosis.healthLowImpact') + impact: t('admin.ops.diagnosis.healthLowImpact'), + action: t('admin.ops.diagnosis.healthLowAction') }) } } @@ -752,9 +847,12 @@ function openJobsDetails() { /> -
+
{{ item.message }}
{{ item.impact }}
+
+ 💡 {{ item.action }} +