主要改动: - 移除 p95_latency_ms 和 p99_latency_ms 告警指标类型 - 移除配置中的 latency_p99_ms_max 阈值设置 - 简化健康分数计算(移除latency权重,重新归一化SLA和错误率) - 移除duration相关的诊断规则和阈值检查 - 统一术语:延迟 → 请求时长 - 保留duration数据展示,但不再用于告警判断 - 聚焦TTFT作为主要的响应速度告警指标 影响范围: - Backend: handler, service, models, tests - Frontend: API types, i18n, components
142 lines
4.1 KiB
Go
142 lines
4.1 KiB
Go
package service
|
|
|
|
import (
|
|
"math"
|
|
"time"
|
|
)
|
|
|
|
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
|
|
//
|
|
// Design goals:
|
|
// - Backend-owned scoring (UI only displays).
|
|
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
|
|
// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
|
|
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
|
|
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
|
|
if overview == nil {
|
|
return 0
|
|
}
|
|
|
|
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
|
|
// UI can still render a gray/idle state based on QPS + error rate.
|
|
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
|
|
return 100
|
|
}
|
|
|
|
businessHealth := computeBusinessHealth(overview)
|
|
infraHealth := computeInfraHealth(now, overview)
|
|
|
|
// Weighted combination: 70% business + 30% infrastructure
|
|
score := businessHealth*0.7 + infraHealth*0.3
|
|
return int(math.Round(clampFloat64(score, 0, 100)))
|
|
}
|
|
|
|
// computeBusinessHealth calculates business health score (0-100)
|
|
// Components: SLA (50%) + Error Rate (30%)
|
|
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
|
// SLA score: 99.5% → 100, 95% → 0 (linear)
|
|
slaScore := 100.0
|
|
slaPct := clampFloat64(overview.SLA*100, 0, 100)
|
|
if slaPct < 99.5 {
|
|
if slaPct >= 95 {
|
|
slaScore = (slaPct - 95) / 4.5 * 100
|
|
} else {
|
|
slaScore = 0
|
|
}
|
|
}
|
|
|
|
// Error rate score: 0.5% → 100, 5% → 0 (linear)
|
|
// Combines request errors and upstream errors
|
|
errorScore := 100.0
|
|
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
|
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
|
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
|
|
if combinedErrorPct > 0.5 {
|
|
if combinedErrorPct <= 5 {
|
|
errorScore = (5 - combinedErrorPct) / 4.5 * 100
|
|
} else {
|
|
errorScore = 0
|
|
}
|
|
}
|
|
|
|
// Weighted combination (renormalized after removing duration)
|
|
const weightSum = 0.8
|
|
return (slaScore*0.5 + errorScore*0.3) / weightSum
|
|
}
|
|
|
|
// computeInfraHealth calculates infrastructure health score (0-100)
|
|
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
|
|
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
|
|
// Storage score: DB critical, Redis less critical
|
|
storageScore := 100.0
|
|
if overview.SystemMetrics != nil {
|
|
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
|
|
storageScore = 0 // DB failure is critical
|
|
} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
|
|
storageScore = 50 // Redis failure is degraded but not critical
|
|
}
|
|
}
|
|
|
|
// Compute resources score: CPU + Memory
|
|
computeScore := 100.0
|
|
if overview.SystemMetrics != nil {
|
|
cpuScore := 100.0
|
|
if overview.SystemMetrics.CPUUsagePercent != nil {
|
|
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
|
|
if cpuPct > 80 {
|
|
if cpuPct <= 100 {
|
|
cpuScore = (100 - cpuPct) / 20 * 100
|
|
} else {
|
|
cpuScore = 0
|
|
}
|
|
}
|
|
}
|
|
|
|
memScore := 100.0
|
|
if overview.SystemMetrics.MemoryUsagePercent != nil {
|
|
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
|
|
if memPct > 85 {
|
|
if memPct <= 100 {
|
|
memScore = (100 - memPct) / 15 * 100
|
|
} else {
|
|
memScore = 0
|
|
}
|
|
}
|
|
}
|
|
|
|
computeScore = (cpuScore + memScore) / 2
|
|
}
|
|
|
|
// Background jobs score
|
|
jobScore := 100.0
|
|
failedJobs := 0
|
|
totalJobs := 0
|
|
for _, hb := range overview.JobHeartbeats {
|
|
if hb == nil {
|
|
continue
|
|
}
|
|
totalJobs++
|
|
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
|
|
failedJobs++
|
|
} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
|
|
failedJobs++
|
|
}
|
|
}
|
|
if totalJobs > 0 && failedJobs > 0 {
|
|
jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
|
|
}
|
|
|
|
// Weighted combination
|
|
return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
|
|
}
|
|
|
|
// clampFloat64 limits v to the interval [min, max].
//
// A value below min yields min, a value above max yields max, and any other
// value is returned unchanged. The lower bound is checked first.
func clampFloat64(v float64, min float64, max float64) float64 {
	switch {
	case v < min:
		return min
	case v > max:
		return max
	default:
		return v
	}
}
|