feat(ops): 优化健康评分算法和智能诊断机制

- 采用分层加权评分(业务70% + 基础设施30%),避免重复扣分
- 新增延迟诊断(P99 > 2s critical, > 1s warning)
- 新增资源诊断(CPU/内存/DB/Redis状态)
- 调整诊断阈值(上游错误率5% critical,请求错误率3% critical)
- 为每个诊断项添加可操作建议
- 添加完整的单元测试覆盖(30+测试用例)
- 完善中英文国际化文本
This commit is contained in:
IanShaw027
2026-01-11 21:42:02 +08:00
parent c8e3a476fc
commit 8fffcd8091
5 changed files with 677 additions and 100 deletions

View File

@@ -9,7 +9,8 @@ import (
//
// Design goals:
// - Backend-owned scoring (UI only displays).
// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs).
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
if overview == nil {
@@ -22,97 +23,124 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
return 100
}
score := 100.0
businessHealth := computeBusinessHealth(overview)
infraHealth := computeInfraHealth(now, overview)
// --- SLA (primary signal) ---
// SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later.
// Weighted combination: 70% business + 30% infrastructure
score := businessHealth*0.7 + infraHealth*0.3
return int(math.Round(clampFloat64(score, 0, 100)))
}
// computeBusinessHealth calculates business health score (0-100)
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
// SLA score: 99.5% → 100, 95% → 0 (linear)
slaScore := 100.0
slaPct := clampFloat64(overview.SLA*100, 0, 100)
if slaPct < 99.5 {
// Up to -45 points as SLA drops.
score -= math.Min(45, (99.5-slaPct)*12)
if slaPct >= 95 {
slaScore = (slaPct - 95) / 4.5 * 100
} else {
slaScore = 0
}
}
// --- Error rates (secondary signal) ---
// Error rate score: 0.5% → 100, 5% → 0 (linear)
// Combines request errors and upstream errors
errorScore := 100.0
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
if errorPct > 1 {
// Cap at -20 points by 6% error rate.
score -= math.Min(20, (errorPct-1)*4)
}
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
if upstreamPct > 1 {
// Upstream instability deserves extra weight, but keep it smaller than SLA/error.
score -= math.Min(15, (upstreamPct-1)*3)
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
if combinedErrorPct > 0.5 {
if combinedErrorPct <= 5 {
errorScore = (5 - combinedErrorPct) / 4.5 * 100
} else {
errorScore = 0
}
}
// --- Latency (tail-focused) ---
// Use p99 of duration + TTFT. Penalize only when clearly elevated.
// Latency score: 1s → 100, 10s → 0 (linear)
// Uses P99 of duration (TTFT is less critical for overall health)
latencyScore := 100.0
if overview.Duration.P99 != nil {
p99 := float64(*overview.Duration.P99)
if p99 > 2000 {
// From 2s upward, gradually penalize up to -20.
score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
}
}
if overview.TTFT.P99 != nil {
p99 := float64(*overview.TTFT.P99)
if p99 > 500 {
// TTFT > 500ms starts hurting; cap at -10.
score -= math.Min(10, (p99-500)/200) // 2.5s => -10
if p99 > 1000 {
if p99 <= 10000 {
latencyScore = (10000 - p99) / 9000 * 100
} else {
latencyScore = 0
}
}
}
// --- System metrics snapshot (best-effort) ---
// Weighted combination
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
}
// computeInfraHealth calculates infrastructure health score (0-100)
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
// Storage score: DB critical, Redis less critical
storageScore := 100.0
if overview.SystemMetrics != nil {
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
score -= 20
}
if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
score -= 15
}
if overview.SystemMetrics.CPUUsagePercent != nil {
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
if cpuPct > 85 {
score -= math.Min(10, (cpuPct-85)*1.5)
}
}
if overview.SystemMetrics.MemoryUsagePercent != nil {
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
if memPct > 90 {
score -= math.Min(10, (memPct-90)*1.0)
}
}
if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
waiting := float64(*overview.SystemMetrics.DBConnWaiting)
score -= math.Min(10, waiting*2)
}
if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
score -= math.Min(10, depth*0.5)
storageScore = 0 // DB failure is critical
} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
storageScore = 50 // Redis failure is degraded but not critical
}
}
// --- Job heartbeats (best-effort) ---
// Penalize only clear "error after last success" signals, and cap the impact.
jobPenalty := 0.0
// Compute resources score: CPU + Memory
computeScore := 100.0
if overview.SystemMetrics != nil {
cpuScore := 100.0
if overview.SystemMetrics.CPUUsagePercent != nil {
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
if cpuPct > 80 {
if cpuPct <= 100 {
cpuScore = (100 - cpuPct) / 20 * 100
} else {
cpuScore = 0
}
}
}
memScore := 100.0
if overview.SystemMetrics.MemoryUsagePercent != nil {
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
if memPct > 85 {
if memPct <= 100 {
memScore = (100 - memPct) / 15 * 100
} else {
memScore = 0
}
}
}
computeScore = (cpuScore + memScore) / 2
}
// Background jobs score
jobScore := 100.0
failedJobs := 0
totalJobs := 0
for _, hb := range overview.JobHeartbeats {
if hb == nil {
continue
}
totalJobs++
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
jobPenalty += 5
continue
}
if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
jobPenalty += 2
failedJobs++
} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
failedJobs++
}
}
score -= math.Min(15, jobPenalty)
if totalJobs > 0 && failedJobs > 0 {
jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
}
score = clampFloat64(score, 0, 100)
return int(math.Round(score))
// Weighted combination
return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
}
func clampFloat64(v float64, min float64, max float64) float64 {

View File

@@ -55,6 +55,377 @@ func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
require.GreaterOrEqual(t, score, 0)
}
func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
t.Parallel()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin int
wantMax int
}{
{
name: "nil overview returns 0",
overview: nil,
wantMin: 0,
wantMax: 0,
},
{
name: "perfect health",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 1.0,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
TTFT: OpsPercentiles{P99: intPtr(100)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 100,
wantMax: 100,
},
{
name: "good health - SLA 99.8%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.998,
ErrorRate: 0.003,
UpstreamErrorRate: 0.001,
Duration: OpsPercentiles{P99: intPtr(800)},
TTFT: OpsPercentiles{P99: intPtr(200)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(50),
MemoryUsagePercent: float64Ptr(60),
},
},
wantMin: 95,
wantMax: 100,
},
{
name: "medium health - SLA 96%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.96,
ErrorRate: 0.02,
UpstreamErrorRate: 0.01,
Duration: OpsPercentiles{P99: intPtr(3000)},
TTFT: OpsPercentiles{P99: intPtr(600)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(70),
MemoryUsagePercent: float64Ptr(75),
},
},
wantMin: 60,
wantMax: 85,
},
{
name: "DB failure",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 70,
wantMax: 90,
},
{
name: "Redis failure",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 95,
},
{
name: "high CPU usage",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(95),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 100,
},
{
name: "combined failures - business degraded + infra healthy",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.90,
ErrorRate: 0.05,
UpstreamErrorRate: 0.02,
Duration: OpsPercentiles{P99: intPtr(10000)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(20),
MemoryUsagePercent: float64Ptr(30),
},
},
wantMin: 25,
wantMax: 50,
},
{
name: "combined failures - business healthy + infra degraded",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.998,
ErrorRate: 0.001,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(600)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(95),
MemoryUsagePercent: float64Ptr(95),
},
},
wantMin: 70,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
require.GreaterOrEqual(t, score, 0, "score must be >= 0")
require.LessOrEqual(t, score, 100, "score must be <= 100")
})
}
}
func TestComputeBusinessHealth(t *testing.T) {
t.Parallel()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin float64
wantMax float64
}{
{
name: "perfect metrics",
overview: &OpsDashboardOverview{
SLA: 1.0,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 100,
wantMax: 100,
},
{
name: "SLA boundary 99.5%",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 100,
wantMax: 100,
},
{
name: "SLA boundary 95%",
overview: &OpsDashboardOverview{
SLA: 0.95,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 50,
wantMax: 60,
},
{
name: "error rate boundary 0.5%",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0.005,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 95,
wantMax: 100,
},
{
name: "latency boundary 1000ms",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(1000)},
},
wantMin: 95,
wantMax: 100,
},
{
name: "upstream error dominates",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0.001,
UpstreamErrorRate: 0.03,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 75,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeBusinessHealth(tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
})
}
}
func TestComputeInfraHealth(t *testing.T) {
t.Parallel()
now := time.Now().UTC()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin float64
wantMax float64
}{
{
name: "all infrastructure healthy",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 100,
wantMax: 100,
},
{
name: "DB down",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 50,
wantMax: 70,
},
{
name: "Redis down",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 80,
wantMax: 95,
},
{
name: "CPU at 90%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(90),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 95,
},
{
name: "failed background job",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
JobHeartbeats: []*OpsJobHeartbeat{
{
JobName: "test-job",
LastErrorAt: &now,
},
},
},
wantMin: 70,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeInfraHealth(now, tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
})
}
}
func timePtr(v time.Time) *time.Time { return &v }
func stringPtr(v string) *string { return &v }