Files
sub2api/backend/internal/service/ops_health_score.go
IanShaw027 b2ff326ced fix(ops): 调整健康分数权重以修复CI测试
- 将业务健康和基础设施健康的权重从80/20调整为70/30
- 使基础设施故障(DB/Redis)对总分影响更明显
- 修复三个失败的测试用例:
  * DB故障: 92→88 (期望70-90)
  * Redis故障: 96→94 (期望85-95)
  * 业务降级: 82→84.5 (期望84-85)
2026-01-15 23:02:15 +08:00

144 lines
4.2 KiB
Go

package service
import (
"math"
"time"
)
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
//
// Design goals:
// - Backend-owned scoring (UI only displays).
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
if overview == nil {
return 0
}
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
// UI can still render a gray/idle state based on QPS + error rate.
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
return 100
}
businessHealth := computeBusinessHealth(overview)
infraHealth := computeInfraHealth(now, overview)
// Weighted combination: 70% business + 30% infrastructure
score := businessHealth*0.7 + infraHealth*0.3
return int(math.Round(clampFloat64(score, 0, 100)))
}
// computeBusinessHealth calculates business health score (0-100)
// Components: Error Rate (50%) + TTFT (50%)
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
// Error rate score: 1% → 100, 10% → 0 (linear)
// Combines request errors and upstream errors
errorScore := 100.0
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
if combinedErrorPct > 1.0 {
if combinedErrorPct <= 10.0 {
errorScore = (10.0 - combinedErrorPct) / 9.0 * 100
} else {
errorScore = 0
}
}
// TTFT score: 1s → 100, 3s → 0 (linear)
// Time to first token is critical for user experience
ttftScore := 100.0
if overview.TTFT.P99 != nil {
p99 := float64(*overview.TTFT.P99)
if p99 > 1000 {
if p99 <= 3000 {
ttftScore = (3000 - p99) / 2000 * 100
} else {
ttftScore = 0
}
}
}
// Weighted combination: 50% error rate + 50% TTFT
return errorScore*0.5 + ttftScore*0.5
}
// computeInfraHealth calculates infrastructure health score (0-100)
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
// Storage score: DB critical, Redis less critical
storageScore := 100.0
if overview.SystemMetrics != nil {
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
storageScore = 0 // DB failure is critical
} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
storageScore = 50 // Redis failure is degraded but not critical
}
}
// Compute resources score: CPU + Memory
computeScore := 100.0
if overview.SystemMetrics != nil {
cpuScore := 100.0
if overview.SystemMetrics.CPUUsagePercent != nil {
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
if cpuPct > 80 {
if cpuPct <= 100 {
cpuScore = (100 - cpuPct) / 20 * 100
} else {
cpuScore = 0
}
}
}
memScore := 100.0
if overview.SystemMetrics.MemoryUsagePercent != nil {
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
if memPct > 85 {
if memPct <= 100 {
memScore = (100 - memPct) / 15 * 100
} else {
memScore = 0
}
}
}
computeScore = (cpuScore + memScore) / 2
}
// Background jobs score
jobScore := 100.0
failedJobs := 0
totalJobs := 0
for _, hb := range overview.JobHeartbeats {
if hb == nil {
continue
}
totalJobs++
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
failedJobs++
} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
failedJobs++
}
}
if totalJobs > 0 && failedJobs > 0 {
jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
}
// Weighted combination
return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
}
func clampFloat64(v float64, min float64, max float64) float64 {
if v < min {
return min
}
if v > max {
return max
}
return v
}