feat(运维监控): 增强监控功能和健康评分系统

后端改进：
- 新增健康评分计算服务（ops_health_score.go）
- 添加分布式锁支持（ops_advisory_lock.go）
- 优化指标采集和聚合逻辑
- 新增运维指标采集间隔配置（60-3600秒）
- 移除未使用的WebSocket查询token认证中间件
- 改进清理服务和告警评估逻辑

前端改进：
- 简化OpsDashboard组件结构
- 完善国际化文本（中英文）
- 新增运维监控相关API类型定义
- 添加运维指标采集间隔设置界面
- 优化错误详情模态框

测试：
- 添加健康评分单元测试
- 更新API契约测试
2026-01-10 01:38:47 +08:00
parent 8ae75e7f6e
commit 585257d340
25 changed files with 570 additions and 385 deletions
--- a/backend/internal/service/ops_health_score.go
+++ b/backend/internal/service/ops_health_score.go
@@ -0,0 +1,126 @@
+package service
+
+import (
+	"math"
+	"time"
+)
+
+// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
+//
+// Design goals: backend-owned scoring (the UI only displays it); "overall" business indicators (SLA/error/latency)
+// plus infra indicators (db/redis/cpu/mem/jobs); conservative + stable, i.e. penalize clear degradations and avoid
+// overreacting to missing/idle data. It returns 0 for a nil overview and 100 when idle; otherwise it starts at 100,
+// subtracts capped penalties (nil optional fields are skipped), clamps to 0..100 and rounds. now only ages job heartbeats.
+func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
+	if overview == nil {
+		return 0 // nothing to score: report the lowest value
+	}
+
+	// Idle/no-data: with no SLA requests, no requests and no errors counted, report 100 rather than a "bad" score (the UI
+	// can still render a gray/idle state). NOTE(review): this early return also skips the infra and job penalties below.
+	if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
+		return 100
+	}
+
+	score := 100.0
+
+	// --- SLA (primary signal) ---
+	// SLA is a ratio (0..1), scaled to percent and clamped. The 99.5% target is intentionally modest for LLM gateways; it can be tuned later.
+	slaPct := clampFloat64(overview.SLA*100, 0, 100)
+	if slaPct < 99.5 {
+		// 12 points per percentage point below 99.5, capped at -45 (reached at 95.75%).
+		score -= math.Min(45, (99.5-slaPct)*12)
+	}
+
+	// --- Error rates (secondary signal): ratios scaled to percent; the first 1% is never penalized ---
+	errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
+	if errorPct > 1 {
+		// 4 points per percentage point above 1%; the -20 cap is reached at a 6% error rate.
+		score -= math.Min(20, (errorPct-1)*4)
+	}
+
+	upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
+	if upstreamPct > 1 {
+		// Upstream instability deserves extra weight, but less than SLA/error: 3 points per point above 1%, capped at -15 (at 6%).
+		score -= math.Min(15, (upstreamPct-1)*3)
+	}
+
+	// --- Latency (tail-focused) ---
+	// Use p99 of duration + TTFT, each skipped when nil. Penalize only when clearly elevated. Values presumably in ms — TODO confirm.
+	if overview.Duration.P99 != nil {
+		p99 := float64(*overview.Duration.P99)
+		if p99 > 2000 {
+			// From 2s upward, gradually penalize: one point per 900 above 2000, up to -20.
+			score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
+		}
+	}
+	if overview.TTFT.P99 != nil {
+		p99 := float64(*overview.TTFT.P99)
+		if p99 > 500 {
+			// TTFT > 500ms starts hurting: one point per 200 above 500, capped at -10.
+			score -= math.Min(10, (p99-500)/200) // 2.5s => -10
+		}
+	}
+
+	// --- System metrics snapshot (best-effort: the snapshot and each of its fields are skipped when nil) ---
+	if overview.SystemMetrics != nil {
+		if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
+			score -= 20 // flat penalty: DBOK is explicitly false
+		}
+		if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
+			score -= 15 // flat penalty: RedisOK is explicitly false
+		}
+
+		if overview.SystemMetrics.CPUUsagePercent != nil {
+			cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
+			if cpuPct > 85 {
+				score -= math.Min(10, (cpuPct-85)*1.5) // 1.5 points per point above 85%, capped at -10
+			}
+		}
+		if overview.SystemMetrics.MemoryUsagePercent != nil {
+			memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
+			if memPct > 90 {
+				score -= math.Min(10, (memPct-90)*1.0) // 1 point per point above 90%, so -10 only at 100%
+			}
+		}
+
+		if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
+			waiting := float64(*overview.SystemMetrics.DBConnWaiting)
+			score -= math.Min(10, waiting*2) // 2 points per unit of DBConnWaiting, capped at -10 (5 or more)
+		}
+		if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
+			depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
+			score -= math.Min(10, depth*0.5) // 0.5 points per unit of queue depth, capped at -10 (20 or more)
+		}
+	}
+
+	// --- Job heartbeats (best-effort; nil entries are skipped) ---
+	// Per job: +5 for an error newer than the last success (or with no success at all), else +2 for a success older than 15 minutes; total capped at 15.
+	jobPenalty := 0.0
+	for _, hb := range overview.JobHeartbeats {
+		if hb == nil {
+			continue
+		}
+		if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
+			jobPenalty += 5 // most recent recorded outcome is an error
+			continue
+		}
+		if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
+			jobPenalty += 2 // last success is stale relative to now
+		}
+	}
+	score -= math.Min(15, jobPenalty)
+
+	score = clampFloat64(score, 0, 100) // the individual caps sum to more than 100, so floor the result at 0
+	return int(math.Round(score))
+}
+
+func clampFloat64(v float64, min float64, max float64) float64 { // limits v to the range [min, max] (assumes min <= max)
+	if v < min { // below the range: pin to the lower bound
+		return min
+	}
+	if v > max { // above the range: pin to the upper bound
+		return max
+	}
+	return v // in range; NOTE(review): a NaN v also falls through to here, and min/max shadow the Go 1.21+ builtins
+}