Files
sub2api/backend/internal/service/ops_health_score.go
IanShaw027 585257d340 feat(运维监控): 增强监控功能和健康评分系统
后端改进:
- 新增健康评分计算服务(ops_health_score.go)
- 添加分布式锁支持(ops_advisory_lock.go)
- 优化指标采集和聚合逻辑
- 新增运维指标采集间隔配置(60-3600秒)
- 移除未使用的WebSocket查询token认证中间件
- 改进清理服务和告警评估逻辑

前端改进:
- 简化OpsDashboard组件结构
- 完善国际化文本(中英文)
- 新增运维监控相关API类型定义
- 添加运维指标采集间隔设置界面
- 优化错误详情模态框

测试:
- 添加健康评分单元测试
- 更新API契约测试
2026-01-10 01:38:47 +08:00

127 lines
3.8 KiB
Go

package service
import (
"math"
"time"
)
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
//
// Design goals:
// - Backend-owned scoring (UI only displays).
// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs).
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
if overview == nil {
return 0
}
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
// UI can still render a gray/idle state based on QPS + error rate.
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
return 100
}
score := 100.0
// --- SLA (primary signal) ---
// SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later.
slaPct := clampFloat64(overview.SLA*100, 0, 100)
if slaPct < 99.5 {
// Up to -45 points as SLA drops.
score -= math.Min(45, (99.5-slaPct)*12)
}
// --- Error rates (secondary signal) ---
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
if errorPct > 1 {
// Cap at -20 points by 6% error rate.
score -= math.Min(20, (errorPct-1)*4)
}
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
if upstreamPct > 1 {
// Upstream instability deserves extra weight, but keep it smaller than SLA/error.
score -= math.Min(15, (upstreamPct-1)*3)
}
// --- Latency (tail-focused) ---
// Use p99 of duration + TTFT. Penalize only when clearly elevated.
if overview.Duration.P99 != nil {
p99 := float64(*overview.Duration.P99)
if p99 > 2000 {
// From 2s upward, gradually penalize up to -20.
score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
}
}
if overview.TTFT.P99 != nil {
p99 := float64(*overview.TTFT.P99)
if p99 > 500 {
// TTFT > 500ms starts hurting; cap at -10.
score -= math.Min(10, (p99-500)/200) // 2.5s => -10
}
}
// --- System metrics snapshot (best-effort) ---
if overview.SystemMetrics != nil {
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
score -= 20
}
if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
score -= 15
}
if overview.SystemMetrics.CPUUsagePercent != nil {
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
if cpuPct > 85 {
score -= math.Min(10, (cpuPct-85)*1.5)
}
}
if overview.SystemMetrics.MemoryUsagePercent != nil {
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
if memPct > 90 {
score -= math.Min(10, (memPct-90)*1.0)
}
}
if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
waiting := float64(*overview.SystemMetrics.DBConnWaiting)
score -= math.Min(10, waiting*2)
}
if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
score -= math.Min(10, depth*0.5)
}
}
// --- Job heartbeats (best-effort) ---
// Penalize only clear "error after last success" signals, and cap the impact.
jobPenalty := 0.0
for _, hb := range overview.JobHeartbeats {
if hb == nil {
continue
}
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
jobPenalty += 5
continue
}
if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
jobPenalty += 2
}
}
score -= math.Min(15, jobPenalty)
score = clampFloat64(score, 0, 100)
return int(math.Round(score))
}
func clampFloat64(v float64, min float64, max float64) float64 {
if v < min {
return min
}
if v > max {
return max
}
return v
}