后端改进: - 新增健康评分计算服务(ops_health_score.go) - 添加分布式锁支持(ops_advisory_lock.go) - 优化指标采集和聚合逻辑 - 新增运维指标采集间隔配置(60-3600秒) - 移除未使用的WebSocket查询token认证中间件 - 改进清理服务和告警评估逻辑 前端改进: - 简化OpsDashboard组件结构 - 完善国际化文本(中英文) - 新增运维监控相关API类型定义 - 添加运维指标采集间隔设置界面 - 优化错误详情模态框 测试: - 添加健康评分单元测试 - 更新API契约测试
127 lines
3.8 KiB
Go
127 lines
3.8 KiB
Go
package service
|
|
|
|
import (
|
|
"math"
|
|
"time"
|
|
)
|
|
|
|
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
|
|
//
|
|
// Design goals:
|
|
// - Backend-owned scoring (UI only displays).
|
|
// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs).
|
|
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
|
|
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
|
|
if overview == nil {
|
|
return 0
|
|
}
|
|
|
|
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
|
|
// UI can still render a gray/idle state based on QPS + error rate.
|
|
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
|
|
return 100
|
|
}
|
|
|
|
score := 100.0
|
|
|
|
// --- SLA (primary signal) ---
|
|
// SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later.
|
|
slaPct := clampFloat64(overview.SLA*100, 0, 100)
|
|
if slaPct < 99.5 {
|
|
// Up to -45 points as SLA drops.
|
|
score -= math.Min(45, (99.5-slaPct)*12)
|
|
}
|
|
|
|
// --- Error rates (secondary signal) ---
|
|
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
|
if errorPct > 1 {
|
|
// Cap at -20 points by 6% error rate.
|
|
score -= math.Min(20, (errorPct-1)*4)
|
|
}
|
|
|
|
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
|
if upstreamPct > 1 {
|
|
// Upstream instability deserves extra weight, but keep it smaller than SLA/error.
|
|
score -= math.Min(15, (upstreamPct-1)*3)
|
|
}
|
|
|
|
// --- Latency (tail-focused) ---
|
|
// Use p99 of duration + TTFT. Penalize only when clearly elevated.
|
|
if overview.Duration.P99 != nil {
|
|
p99 := float64(*overview.Duration.P99)
|
|
if p99 > 2000 {
|
|
// From 2s upward, gradually penalize up to -20.
|
|
score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
|
|
}
|
|
}
|
|
if overview.TTFT.P99 != nil {
|
|
p99 := float64(*overview.TTFT.P99)
|
|
if p99 > 500 {
|
|
// TTFT > 500ms starts hurting; cap at -10.
|
|
score -= math.Min(10, (p99-500)/200) // 2.5s => -10
|
|
}
|
|
}
|
|
|
|
// --- System metrics snapshot (best-effort) ---
|
|
if overview.SystemMetrics != nil {
|
|
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
|
|
score -= 20
|
|
}
|
|
if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
|
|
score -= 15
|
|
}
|
|
|
|
if overview.SystemMetrics.CPUUsagePercent != nil {
|
|
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
|
|
if cpuPct > 85 {
|
|
score -= math.Min(10, (cpuPct-85)*1.5)
|
|
}
|
|
}
|
|
if overview.SystemMetrics.MemoryUsagePercent != nil {
|
|
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
|
|
if memPct > 90 {
|
|
score -= math.Min(10, (memPct-90)*1.0)
|
|
}
|
|
}
|
|
|
|
if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
|
|
waiting := float64(*overview.SystemMetrics.DBConnWaiting)
|
|
score -= math.Min(10, waiting*2)
|
|
}
|
|
if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
|
|
depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
|
|
score -= math.Min(10, depth*0.5)
|
|
}
|
|
}
|
|
|
|
// --- Job heartbeats (best-effort) ---
|
|
// Penalize only clear "error after last success" signals, and cap the impact.
|
|
jobPenalty := 0.0
|
|
for _, hb := range overview.JobHeartbeats {
|
|
if hb == nil {
|
|
continue
|
|
}
|
|
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
|
|
jobPenalty += 5
|
|
continue
|
|
}
|
|
if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
|
|
jobPenalty += 2
|
|
}
|
|
}
|
|
score -= math.Min(15, jobPenalty)
|
|
|
|
score = clampFloat64(score, 0, 100)
|
|
return int(math.Round(score))
|
|
}
|
|
|
|
// clampFloat64 limits v to the closed interval [lo, hi]: values below lo
// become lo, values above hi become hi, and everything else is returned
// unchanged. A NaN input is returned as-is because both comparisons are false
// for NaN.
//
// The bounds are named lo/hi rather than min/max so they do not shadow the
// predeclared min and max functions.
func clampFloat64(v float64, lo float64, hi float64) float64 {
	if v < lo {
		return lo
	}
	if v > hi {
		return hi
	}
	return v
}
|