feat(运维监控): 增强监控功能和健康评分系统
后端改进: - 新增健康评分计算服务(ops_health_score.go) - 添加分布式锁支持(ops_advisory_lock.go) - 优化指标采集和聚合逻辑 - 新增运维指标采集间隔配置(60-3600秒) - 移除未使用的WebSocket查询token认证中间件 - 改进清理服务和告警评估逻辑 前端改进: - 简化OpsDashboard组件结构 - 完善国际化文本(中英文) - 新增运维监控相关API类型定义 - 添加运维指标采集间隔设置界面 - 优化错误详情模态框 测试: - 添加健康评分单元测试 - 更新API契约测试
This commit is contained in:
126
backend/internal/service/ops_health_score.go
Normal file
126
backend/internal/service/ops_health_score.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"math"
|
||||
"time"
|
||||
)
|
||||
|
||||
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
|
||||
//
|
||||
// Design goals:
|
||||
// - Backend-owned scoring (UI only displays).
|
||||
// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs).
|
||||
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
|
||||
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
|
||||
if overview == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
|
||||
// UI can still render a gray/idle state based on QPS + error rate.
|
||||
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
|
||||
return 100
|
||||
}
|
||||
|
||||
score := 100.0
|
||||
|
||||
// --- SLA (primary signal) ---
|
||||
// SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later.
|
||||
slaPct := clampFloat64(overview.SLA*100, 0, 100)
|
||||
if slaPct < 99.5 {
|
||||
// Up to -45 points as SLA drops.
|
||||
score -= math.Min(45, (99.5-slaPct)*12)
|
||||
}
|
||||
|
||||
// --- Error rates (secondary signal) ---
|
||||
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
||||
if errorPct > 1 {
|
||||
// Cap at -20 points by 6% error rate.
|
||||
score -= math.Min(20, (errorPct-1)*4)
|
||||
}
|
||||
|
||||
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
||||
if upstreamPct > 1 {
|
||||
// Upstream instability deserves extra weight, but keep it smaller than SLA/error.
|
||||
score -= math.Min(15, (upstreamPct-1)*3)
|
||||
}
|
||||
|
||||
// --- Latency (tail-focused) ---
|
||||
// Use p99 of duration + TTFT. Penalize only when clearly elevated.
|
||||
if overview.Duration.P99 != nil {
|
||||
p99 := float64(*overview.Duration.P99)
|
||||
if p99 > 2000 {
|
||||
// From 2s upward, gradually penalize up to -20.
|
||||
score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
|
||||
}
|
||||
}
|
||||
if overview.TTFT.P99 != nil {
|
||||
p99 := float64(*overview.TTFT.P99)
|
||||
if p99 > 500 {
|
||||
// TTFT > 500ms starts hurting; cap at -10.
|
||||
score -= math.Min(10, (p99-500)/200) // 2.5s => -10
|
||||
}
|
||||
}
|
||||
|
||||
// --- System metrics snapshot (best-effort) ---
|
||||
if overview.SystemMetrics != nil {
|
||||
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
|
||||
score -= 20
|
||||
}
|
||||
if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
|
||||
score -= 15
|
||||
}
|
||||
|
||||
if overview.SystemMetrics.CPUUsagePercent != nil {
|
||||
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
|
||||
if cpuPct > 85 {
|
||||
score -= math.Min(10, (cpuPct-85)*1.5)
|
||||
}
|
||||
}
|
||||
if overview.SystemMetrics.MemoryUsagePercent != nil {
|
||||
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
|
||||
if memPct > 90 {
|
||||
score -= math.Min(10, (memPct-90)*1.0)
|
||||
}
|
||||
}
|
||||
|
||||
if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
|
||||
waiting := float64(*overview.SystemMetrics.DBConnWaiting)
|
||||
score -= math.Min(10, waiting*2)
|
||||
}
|
||||
if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
|
||||
depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
|
||||
score -= math.Min(10, depth*0.5)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Job heartbeats (best-effort) ---
|
||||
// Penalize only clear "error after last success" signals, and cap the impact.
|
||||
jobPenalty := 0.0
|
||||
for _, hb := range overview.JobHeartbeats {
|
||||
if hb == nil {
|
||||
continue
|
||||
}
|
||||
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
|
||||
jobPenalty += 5
|
||||
continue
|
||||
}
|
||||
if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
|
||||
jobPenalty += 2
|
||||
}
|
||||
}
|
||||
score -= math.Min(15, jobPenalty)
|
||||
|
||||
score = clampFloat64(score, 0, 100)
|
||||
return int(math.Round(score))
|
||||
}
|
||||
|
||||
// clampFloat64 limits v to the closed interval [min, max].
//
// Values below min are replaced by min, values above max by max, and anything
// else is returned unchanged. The lower bound is checked first, so if the
// bounds are inverted (min > max) a value below min still yields min. A NaN
// input compares false against both bounds and is returned as-is.
func clampFloat64(v float64, min float64, max float64) float64 {
	switch {
	case v < min:
		return min
	case v > max:
		return max
	}
	return v
}
|
||||
Reference in New Issue
Block a user