refactor(ops): 优化健康分数计算逻辑和阈值

- 移除 SLA 组件（与错误率重复） - 恢复延迟组件，阈值调整为 1s-2s - 错误率阈值调整为 1%-10%（更宽松） - 业务健康分数：错误率 50% + 延迟 50% - 更新所有相关测试用例期望值
2026-01-14 23:43:12 +08:00
parent 9b10241561
commit d5eab7da3b
2 changed files with 52 additions and 28 deletions
--- a/backend/internal/service/ops_health_score.go
+++ b/backend/internal/service/ops_health_score.go
@@ -32,36 +32,38 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
 }
 // computeBusinessHealth calculates business health score (0-100)
-// Components: SLA (50%) + Error Rate (30%)
+// Components: Error Rate (50%) + Latency (50%)
 func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
-	// SLA score: 99.5% → 100, 95% → 0 (linear)
+	// SLA score: 99.5% → 100, 95% → 0 (linear); not included in the returned score
 	slaScore := 100.0
 	slaPct := clampFloat64(overview.SLA*100, 0, 100)
 	if slaPct < 99.5 {
 		if slaPct >= 95 {
 			slaScore = (slaPct - 95) / 4.5 * 100
 		} else {
 			slaScore = 0
 		}
 	}
 	// Error rate score: 1% → 100, 10% → 0 (linear)
 	// Combines request errors and upstream errors
 	errorScore := 100.0
 	errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
 	upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
 	combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
-	if combinedErrorPct > 0.5 {
+	if combinedErrorPct > 1.0 {
-		if combinedErrorPct <= 5 {
+		if combinedErrorPct <= 10.0 {
-			errorScore = (5 - combinedErrorPct) / 4.5 * 100
+			errorScore = (10.0 - combinedErrorPct) / 9.0 * 100
 		} else {
 			errorScore = 0
 		}
 	}
-	// Weighted combination (renormalized after removing duration)
+	// Latency score: 1s → 100, 2s → 0 (linear)
-	const weightSum = 0.8
+	// Uses P99 of duration
-	return (slaScore*0.5 + errorScore*0.3) / weightSum
+	latencyScore := 100.0
 	if overview.Duration.P99 != nil {
 		p99 := float64(*overview.Duration.P99)
 		if p99 > 1000 {
 			if p99 <= 2000 {
 				latencyScore = (2000 - p99) / 1000 * 100
 			} else {
 				latencyScore = 0
 			}
 		}
 	}
 	// Weighted combination: 50% error rate + 50% latency
 	return errorScore*0.5 + latencyScore*0.5
 }
 // computeInfraHealth calculates infrastructure health score (0-100)
--- a/backend/internal/service/ops_health_score_test.go
+++ b/backend/internal/service/ops_health_score_test.go
@@ -127,8 +127,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
 					MemoryUsagePercent: float64Ptr(75),
 				},
 			},
-			wantMin: 57,
+			wantMin: 61,
-			wantMax: 58,
+			wantMax: 62,
 		},
 		{
 			name: "DB failure",
@@ -277,20 +277,42 @@ func TestComputeBusinessHealth(t *testing.T) {
 				UpstreamErrorRate: 0,
 				Duration:          OpsPercentiles{P99: intPtr(500)},
 			},
-			wantMin: 37,
+			wantMin: 100,
-			wantMax: 38,
+			wantMax: 100,
 		},
 		{
-			name: "error rate boundary 0.5%",
+			name: "error rate boundary 1%",
 			overview: &OpsDashboardOverview{
-				SLA:               0.995,
+				SLA:               0.99,
-				ErrorRate:         0.005,
+				ErrorRate:         0.01,
 				UpstreamErrorRate: 0,
 				Duration:          OpsPercentiles{P99: intPtr(500)},
 			},
-			wantMin: 95,
+			wantMin: 100,
 			wantMax: 100,
 		},
 		{
 			name: "error rate 5%",
 			overview: &OpsDashboardOverview{
 				SLA:               0.95,
 				ErrorRate:         0.05,
 				UpstreamErrorRate: 0,
 				Duration:          OpsPercentiles{P99: intPtr(500)},
 			},
 			wantMin: 77,
 			wantMax: 78,
 		},
 		{
 			name: "latency boundary 2s",
 			overview: &OpsDashboardOverview{
 				SLA:               0.99,
 				ErrorRate:         0,
 				UpstreamErrorRate: 0,
 				Duration:          OpsPercentiles{P99: intPtr(2000)},
 			},
 			wantMin: 50,
 			wantMax: 50,
 		},
 		{
 			name: "upstream error dominates",
 			overview: &OpsDashboardOverview{
@@ -299,7 +321,7 @@ func TestComputeBusinessHealth(t *testing.T) {
 				UpstreamErrorRate: 0.03,
 				Duration:          OpsPercentiles{P99: intPtr(500)},
 			},
-			wantMin: 75,
+			wantMin: 88,
 			wantMax: 90,
 		},
 	}