refactor(ops): 优化健康分数计算逻辑和阈值
- 移除 SLA 组件(与错误率重复)
- 恢复延迟组件,阈值调整为 1s-2s
- 错误率阈值调整为 1%-10%(更宽松)
- 业务健康分数:错误率 50% + 延迟 50%
- 更新所有相关测试用例期望值
This commit is contained in:
@@ -32,36 +32,38 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// computeBusinessHealth calculates business health score (0-100)
|
// computeBusinessHealth calculates business health score (0-100)
|
||||||
// Components: SLA (50%) + Error Rate (30%)
|
// Components: Error Rate (50%) + Latency (50%)
|
||||||
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||||
// SLA score: 99.5% → 100, 95% → 0 (linear)
|
// Error rate score: 1% → 100, 10% → 0 (linear)
|
||||||
slaScore := 100.0
|
|
||||||
slaPct := clampFloat64(overview.SLA*100, 0, 100)
|
|
||||||
if slaPct < 99.5 {
|
|
||||||
if slaPct >= 95 {
|
|
||||||
slaScore = (slaPct - 95) / 4.5 * 100
|
|
||||||
} else {
|
|
||||||
slaScore = 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Error rate score: 0.5% → 100, 5% → 0 (linear)
|
|
||||||
// Combines request errors and upstream errors
|
// Combines request errors and upstream errors
|
||||||
errorScore := 100.0
|
errorScore := 100.0
|
||||||
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
||||||
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
||||||
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
|
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
|
||||||
if combinedErrorPct > 0.5 {
|
if combinedErrorPct > 1.0 {
|
||||||
if combinedErrorPct <= 5 {
|
if combinedErrorPct <= 10.0 {
|
||||||
errorScore = (5 - combinedErrorPct) / 4.5 * 100
|
errorScore = (10.0 - combinedErrorPct) / 9.0 * 100
|
||||||
} else {
|
} else {
|
||||||
errorScore = 0
|
errorScore = 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Weighted combination (renormalized after removing duration)
|
// Latency score: 1s → 100, 2s → 0 (linear)
|
||||||
const weightSum = 0.8
|
// Uses P99 of duration
|
||||||
return (slaScore*0.5 + errorScore*0.3) / weightSum
|
latencyScore := 100.0
|
||||||
|
if overview.Duration.P99 != nil {
|
||||||
|
p99 := float64(*overview.Duration.P99)
|
||||||
|
if p99 > 1000 {
|
||||||
|
if p99 <= 2000 {
|
||||||
|
latencyScore = (2000 - p99) / 1000 * 100
|
||||||
|
} else {
|
||||||
|
latencyScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Weighted combination: 50% error rate + 50% latency
|
||||||
|
return errorScore*0.5 + latencyScore*0.5
|
||||||
}
|
}
|
||||||
|
|
||||||
// computeInfraHealth calculates infrastructure health score (0-100)
|
// computeInfraHealth calculates infrastructure health score (0-100)
|
||||||
|
|||||||
@@ -127,8 +127,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
|
|||||||
MemoryUsagePercent: float64Ptr(75),
|
MemoryUsagePercent: float64Ptr(75),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
wantMin: 57,
|
wantMin: 61,
|
||||||
wantMax: 58,
|
wantMax: 62,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "DB failure",
|
name: "DB failure",
|
||||||
@@ -277,20 +277,42 @@ func TestComputeBusinessHealth(t *testing.T) {
|
|||||||
UpstreamErrorRate: 0,
|
UpstreamErrorRate: 0,
|
||||||
Duration: OpsPercentiles{P99: intPtr(500)},
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
},
|
},
|
||||||
wantMin: 37,
|
wantMin: 100,
|
||||||
wantMax: 38,
|
wantMax: 100,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "error rate boundary 0.5%",
|
name: "error rate boundary 1%",
|
||||||
overview: &OpsDashboardOverview{
|
overview: &OpsDashboardOverview{
|
||||||
SLA: 0.995,
|
SLA: 0.99,
|
||||||
ErrorRate: 0.005,
|
ErrorRate: 0.01,
|
||||||
UpstreamErrorRate: 0,
|
UpstreamErrorRate: 0,
|
||||||
Duration: OpsPercentiles{P99: intPtr(500)},
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
},
|
},
|
||||||
wantMin: 95,
|
wantMin: 100,
|
||||||
wantMax: 100,
|
wantMax: 100,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "error rate 5%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.95,
|
||||||
|
ErrorRate: 0.05,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 77,
|
||||||
|
wantMax: 78,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "latency boundary 2s",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.99,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(2000)},
|
||||||
|
},
|
||||||
|
wantMin: 50,
|
||||||
|
wantMax: 50,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "upstream error dominates",
|
name: "upstream error dominates",
|
||||||
overview: &OpsDashboardOverview{
|
overview: &OpsDashboardOverview{
|
||||||
@@ -299,7 +321,7 @@ func TestComputeBusinessHealth(t *testing.T) {
|
|||||||
UpstreamErrorRate: 0.03,
|
UpstreamErrorRate: 0.03,
|
||||||
Duration: OpsPercentiles{P99: intPtr(500)},
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
},
|
},
|
||||||
wantMin: 75,
|
wantMin: 88,
|
||||||
wantMax: 90,
|
wantMax: 90,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user