refactor(ops): 移除duration相关告警指标,简化监控配置
主要改动: - 移除 p95_latency_ms 和 p99_latency_ms 告警指标类型 - 移除配置中的 latency_p99_ms_max 阈值设置 - 简化健康分数计算(移除latency权重,重新归一化SLA和错误率) - 移除duration相关的诊断规则和阈值检查 - 统一术语:延迟 → 请求时长 - 保留duration数据展示,但不再用于告警判断 - 聚焦TTFT作为主要的响应速度告警指标 影响范围: - Backend: handler, service, models, tests - Frontend: API types, i18n, components
This commit is contained in:
@@ -32,7 +32,7 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
|
||||
}
|
||||
|
||||
// computeBusinessHealth calculates business health score (0-100)
|
||||
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
|
||||
// Components: SLA (50%) + Error Rate (30%)
|
||||
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||
// SLA score: 99.5% → 100, 95% → 0 (linear)
|
||||
slaScore := 100.0
|
||||
@@ -59,22 +59,9 @@ func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||
}
|
||||
}
|
||||
|
||||
// Latency score: 1s → 100, 10s → 0 (linear)
|
||||
// Uses P99 of duration (TTFT is less critical for overall health)
|
||||
latencyScore := 100.0
|
||||
if overview.Duration.P99 != nil {
|
||||
p99 := float64(*overview.Duration.P99)
|
||||
if p99 > 1000 {
|
||||
if p99 <= 10000 {
|
||||
latencyScore = (10000 - p99) / 9000 * 100
|
||||
} else {
|
||||
latencyScore = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Weighted combination
|
||||
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
|
||||
// Weighted combination (renormalized after removing duration)
|
||||
const weightSum = 0.8
|
||||
return (slaScore*0.5 + errorScore*0.3) / weightSum
|
||||
}
|
||||
|
||||
// computeInfraHealth calculates infrastructure health score (0-100)
|
||||
|
||||
Reference in New Issue
Block a user