feat(ops): 优化健康评分算法和智能诊断机制

- 采用分层加权评分（业务70% + 基础设施30%），避免重复扣分
- 新增延迟诊断（P99 > 2s critical, > 1s warning）
- 新增资源诊断（CPU/内存/DB/Redis状态）
- 调整诊断阈值（上游错误率5% critical，请求错误率3% critical）
- 为每个诊断项添加可操作建议
- 添加完整的单元测试覆盖（30+测试用例）
- 完善中英文国际化文本
2026-01-11 21:42:02 +08:00
parent c8e3a476fc
commit 8fffcd8091
5 changed files with 677 additions and 100 deletions
--- a/backend/internal/service/ops_health_score.go
+++ b/backend/internal/service/ops_health_score.go
@@ -9,7 +9,8 @@ import (
 //
 // Design goals:
 // - Backend-owned scoring (UI only displays).
-// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs).
+// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
+// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
 // - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
 func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
 	if overview == nil {
@@ -22,97 +23,124 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
 		return 100
 	}

-	score := 100.0
+	businessHealth := computeBusinessHealth(overview)
+	infraHealth := computeInfraHealth(now, overview)

-	// --- SLA (primary signal) ---
-	// SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later.
+	// Weighted combination: 70% business + 30% infrastructure
+	score := businessHealth*0.7 + infraHealth*0.3
+	return int(math.Round(clampFloat64(score, 0, 100)))
+}
+
+// computeBusinessHealth calculates business health score (0-100)
+// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
+func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
+	// SLA score: 99.5% → 100, 95% → 0 (linear)
+	slaScore := 100.0
 	slaPct := clampFloat64(overview.SLA*100, 0, 100)
 	if slaPct < 99.5 {
-		// Up to -45 points as SLA drops.
-		score -= math.Min(45, (99.5-slaPct)*12)
+		if slaPct >= 95 {
+			slaScore = (slaPct - 95) / 4.5 * 100
+		} else {
+			slaScore = 0
+		}
 	}

-	// --- Error rates (secondary signal) ---
+	// Error rate score: 0.5% → 100, 5% → 0 (linear)
+	// Combines request errors and upstream errors
+	errorScore := 100.0
 	errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
-	if errorPct > 1 {
-		// Cap at -20 points by 6% error rate.
-		score -= math.Min(20, (errorPct-1)*4)
-	}
-
 	upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
-	if upstreamPct > 1 {
-		// Upstream instability deserves extra weight, but keep it smaller than SLA/error.
-		score -= math.Min(15, (upstreamPct-1)*3)
+	combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
+	if combinedErrorPct > 0.5 {
+		if combinedErrorPct <= 5 {
+			errorScore = (5 - combinedErrorPct) / 4.5 * 100
+		} else {
+			errorScore = 0
+		}
 	}

-	// --- Latency (tail-focused) ---
-	// Use p99 of duration + TTFT. Penalize only when clearly elevated.
+	// Latency score: 1s → 100, 10s → 0 (linear)
+	// Uses P99 of duration (TTFT is less critical for overall health)
+	latencyScore := 100.0
 	if overview.Duration.P99 != nil {
 		p99 := float64(*overview.Duration.P99)
-		if p99 > 2000 {
-			// From 2s upward, gradually penalize up to -20.
-			score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
-		}
-	}
-	if overview.TTFT.P99 != nil {
-		p99 := float64(*overview.TTFT.P99)
-		if p99 > 500 {
-			// TTFT > 500ms starts hurting; cap at -10.
-			score -= math.Min(10, (p99-500)/200) // 2.5s => -10
+		if p99 > 1000 {
+			if p99 <= 10000 {
+				latencyScore = (10000 - p99) / 9000 * 100
+			} else {
+				latencyScore = 0
+			}
 		}
 	}

-	// --- System metrics snapshot (best-effort) ---
+	// Weighted combination
+	return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
+}
+
+// computeInfraHealth calculates infrastructure health score (0-100)
+// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
+func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
+	// Storage score: DB critical, Redis less critical
+	storageScore := 100.0
 	if overview.SystemMetrics != nil {
 		if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
-			score -= 20
-		}
-		if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
-			score -= 15
-		}
-
-		if overview.SystemMetrics.CPUUsagePercent != nil {
-			cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
-			if cpuPct > 85 {
-				score -= math.Min(10, (cpuPct-85)*1.5)
-			}
-		}
-		if overview.SystemMetrics.MemoryUsagePercent != nil {
-			memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
-			if memPct > 90 {
-				score -= math.Min(10, (memPct-90)*1.0)
-			}
-		}
-
-		if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
-			waiting := float64(*overview.SystemMetrics.DBConnWaiting)
-			score -= math.Min(10, waiting*2)
-		}
-		if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
-			depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
-			score -= math.Min(10, depth*0.5)
+			storageScore = 0 // DB failure is critical
+		} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
+			storageScore = 50 // Redis failure is degraded but not critical
 		}
 	}

-	// --- Job heartbeats (best-effort) ---
-	// Penalize only clear "error after last success" signals, and cap the impact.
-	jobPenalty := 0.0
+	// Compute resources score: CPU + Memory
+	computeScore := 100.0
+	if overview.SystemMetrics != nil {
+		cpuScore := 100.0
+		if overview.SystemMetrics.CPUUsagePercent != nil {
+			cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
+			if cpuPct > 80 {
+				if cpuPct <= 100 {
+					cpuScore = (100 - cpuPct) / 20 * 100
+				} else {
+					cpuScore = 0
+				}
+			}
+		}
+
+		memScore := 100.0
+		if overview.SystemMetrics.MemoryUsagePercent != nil {
+			memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
+			if memPct > 85 {
+				if memPct <= 100 {
+					memScore = (100 - memPct) / 15 * 100
+				} else {
+					memScore = 0
+				}
+			}
+		}
+
+		computeScore = (cpuScore + memScore) / 2
+	}
+
+	// Background jobs score
+	jobScore := 100.0
+	failedJobs := 0
+	totalJobs := 0
 	for _, hb := range overview.JobHeartbeats {
 		if hb == nil {
 			continue
 		}
+		totalJobs++
 		if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
-			jobPenalty += 5
-			continue
-		}
-		if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
-			jobPenalty += 2
+			failedJobs++
+		} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
+			failedJobs++
 		}
 	}
-	score -= math.Min(15, jobPenalty)
+	if totalJobs > 0 && failedJobs > 0 {
+		jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
+	}

-	score = clampFloat64(score, 0, 100)
-	return int(math.Round(score))
+	// Weighted combination
+	return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
 }

 func clampFloat64(v float64, min float64, max float64) float64 {
--- a/backend/internal/service/ops_health_score_test.go
+++ b/backend/internal/service/ops_health_score_test.go
@@ -55,6 +55,377 @@ func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
 	require.GreaterOrEqual(t, score, 0)
 }

+func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		overview *OpsDashboardOverview
+		wantMin  int
+		wantMax  int
+	}{
+		{
+			name:     "nil overview returns 0",
+			overview: nil,
+			wantMin:  0,
+			wantMax:  0,
+		},
+		{
+			name: "perfect health",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               1.0,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+				TTFT:              OpsPercentiles{P99: intPtr(100)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(30),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 100,
+			wantMax: 100,
+		},
+		{
+			name: "good health - SLA 99.8%",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               0.998,
+				ErrorRate:         0.003,
+				UpstreamErrorRate: 0.001,
+				Duration:          OpsPercentiles{P99: intPtr(800)},
+				TTFT:              OpsPercentiles{P99: intPtr(200)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(50),
+					MemoryUsagePercent: float64Ptr(60),
+				},
+			},
+			wantMin: 95,
+			wantMax: 100,
+		},
+		{
+			name: "medium health - SLA 96%",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               0.96,
+				ErrorRate:         0.02,
+				UpstreamErrorRate: 0.01,
+				Duration:          OpsPercentiles{P99: intPtr(3000)},
+				TTFT:              OpsPercentiles{P99: intPtr(600)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(70),
+					MemoryUsagePercent: float64Ptr(75),
+				},
+			},
+			wantMin: 60,
+			wantMax: 85,
+		},
+		{
+			name: "DB failure",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               0.995,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(false),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(30),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 70,
+			wantMax: 90,
+		},
+		{
+			name: "Redis failure",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               0.995,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(false),
+					CPUUsagePercent:    float64Ptr(30),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 85,
+			wantMax: 95,
+		},
+		{
+			name: "high CPU usage",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               0.995,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(95),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 85,
+			wantMax: 100,
+		},
+		{
+			name: "combined failures - business degraded + infra healthy",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               0.90,
+				ErrorRate:         0.05,
+				UpstreamErrorRate: 0.02,
+				Duration:          OpsPercentiles{P99: intPtr(10000)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(20),
+					MemoryUsagePercent: float64Ptr(30),
+				},
+			},
+			wantMin: 25,
+			wantMax: 50,
+		},
+		{
+			name: "combined failures - business healthy + infra degraded",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				RequestCountSLA:   1000,
+				SLA:               0.998,
+				ErrorRate:         0.001,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(600)},
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(false),
+					RedisOK:            boolPtr(false),
+					CPUUsagePercent:    float64Ptr(95),
+					MemoryUsagePercent: float64Ptr(95),
+				},
+			},
+			wantMin: 70,
+			wantMax: 90,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
+			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
+			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
+			require.GreaterOrEqual(t, score, 0, "score must be >= 0")
+			require.LessOrEqual(t, score, 100, "score must be <= 100")
+		})
+	}
+}
+
+func TestComputeBusinessHealth(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		overview *OpsDashboardOverview
+		wantMin  float64
+		wantMax  float64
+	}{
+		{
+			name: "perfect metrics",
+			overview: &OpsDashboardOverview{
+				SLA:               1.0,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+			},
+			wantMin: 100,
+			wantMax: 100,
+		},
+		{
+			name: "SLA boundary 99.5%",
+			overview: &OpsDashboardOverview{
+				SLA:               0.995,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+			},
+			wantMin: 100,
+			wantMax: 100,
+		},
+		{
+			name: "SLA boundary 95%",
+			overview: &OpsDashboardOverview{
+				SLA:               0.95,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+			},
+			wantMin: 50,
+			wantMax: 60,
+		},
+		{
+			name: "error rate boundary 0.5%",
+			overview: &OpsDashboardOverview{
+				SLA:               0.995,
+				ErrorRate:         0.005,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+			},
+			wantMin: 95,
+			wantMax: 100,
+		},
+		{
+			name: "latency boundary 1000ms",
+			overview: &OpsDashboardOverview{
+				SLA:               0.995,
+				ErrorRate:         0,
+				UpstreamErrorRate: 0,
+				Duration:          OpsPercentiles{P99: intPtr(1000)},
+			},
+			wantMin: 95,
+			wantMax: 100,
+		},
+		{
+			name: "upstream error dominates",
+			overview: &OpsDashboardOverview{
+				SLA:               0.995,
+				ErrorRate:         0.001,
+				UpstreamErrorRate: 0.03,
+				Duration:          OpsPercentiles{P99: intPtr(500)},
+			},
+			wantMin: 75,
+			wantMax: 90,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			score := computeBusinessHealth(tt.overview)
+			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
+			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
+			require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
+			require.LessOrEqual(t, score, 100.0, "score must be <= 100")
+		})
+	}
+}
+
+func TestComputeInfraHealth(t *testing.T) {
+	t.Parallel()
+
+	now := time.Now().UTC()
+
+	tests := []struct {
+		name     string
+		overview *OpsDashboardOverview
+		wantMin  float64
+		wantMax  float64
+	}{
+		{
+			name: "all infrastructure healthy",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(30),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 100,
+			wantMax: 100,
+		},
+		{
+			name: "DB down",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(false),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(30),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 50,
+			wantMax: 70,
+		},
+		{
+			name: "Redis down",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(false),
+					CPUUsagePercent:    float64Ptr(30),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 80,
+			wantMax: 95,
+		},
+		{
+			name: "CPU at 90%",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(90),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+			},
+			wantMin: 85,
+			wantMax: 95,
+		},
+		{
+			name: "failed background job",
+			overview: &OpsDashboardOverview{
+				RequestCountTotal: 1000,
+				SystemMetrics: &OpsSystemMetricsSnapshot{
+					DBOK:               boolPtr(true),
+					RedisOK:            boolPtr(true),
+					CPUUsagePercent:    float64Ptr(30),
+					MemoryUsagePercent: float64Ptr(40),
+				},
+				JobHeartbeats: []*OpsJobHeartbeat{
+					{
+						JobName:     "test-job",
+						LastErrorAt: &now,
+					},
+				},
+			},
+			wantMin: 70,
+			wantMax: 90,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			score := computeInfraHealth(now, tt.overview)
+			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
+			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
+			require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
+			require.LessOrEqual(t, score, 100.0, "score must be <= 100")
+		})
+	}
+}
+
 func timePtr(v time.Time) *time.Time { return &v }

 func stringPtr(v string) *string { return &v }