feat(ops): improve the health score algorithm and smart diagnostics

- Adopt layered weighted scoring (business 70% + infrastructure 30%) to avoid double-counting penalties
- Add latency diagnostics (P99 > 2s critical, > 1s warning)
- Add resource diagnostics (CPU / memory / DB / Redis status)
- Adjust diagnostic thresholds (upstream error rate 5% critical, request error rate 3% critical)
- Attach an actionable suggestion to every diagnostic item
- Add full unit-test coverage (30+ test cases)
- Complete the English and Chinese i18n texts
@@ -9,7 +9,8 @@ import (
 //
 // Design goals:
 // - Backend-owned scoring (UI only displays).
-// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs).
+// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
+// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
 // - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
 func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
     if overview == nil {
@@ -22,97 +23,124 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
         return 100
     }

-    score := 100.0
+    businessHealth := computeBusinessHealth(overview)
+    infraHealth := computeInfraHealth(now, overview)

-    // --- SLA (primary signal) ---
-    // SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later.
+    // Weighted combination: 70% business + 30% infrastructure
+    score := businessHealth*0.7 + infraHealth*0.3
+    return int(math.Round(clampFloat64(score, 0, 100)))
+}
+
+// computeBusinessHealth calculates business health score (0-100)
+// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
+func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
+    // SLA score: 99.5% → 100, 95% → 0 (linear)
+    slaScore := 100.0
     slaPct := clampFloat64(overview.SLA*100, 0, 100)
     if slaPct < 99.5 {
-        // Up to -45 points as SLA drops.
-        score -= math.Min(45, (99.5-slaPct)*12)
+        if slaPct >= 95 {
+            slaScore = (slaPct - 95) / 4.5 * 100
+        } else {
+            slaScore = 0
+        }
     }

-    // --- Error rates (secondary signal) ---
+    // Error rate score: 0.5% → 100, 5% → 0 (linear)
+    // Combines request errors and upstream errors
+    errorScore := 100.0
     errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
-    if errorPct > 1 {
-        // Cap at -20 points by 6% error rate.
-        score -= math.Min(20, (errorPct-1)*4)
-    }
-
     upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
-    if upstreamPct > 1 {
-        // Upstream instability deserves extra weight, but keep it smaller than SLA/error.
-        score -= math.Min(15, (upstreamPct-1)*3)
+    combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
+    if combinedErrorPct > 0.5 {
+        if combinedErrorPct <= 5 {
+            errorScore = (5 - combinedErrorPct) / 4.5 * 100
+        } else {
+            errorScore = 0
+        }
     }

-    // --- Latency (tail-focused) ---
-    // Use p99 of duration + TTFT. Penalize only when clearly elevated.
+    // Latency score: 1s → 100, 10s → 0 (linear)
+    // Uses P99 of duration (TTFT is less critical for overall health)
+    latencyScore := 100.0
     if overview.Duration.P99 != nil {
         p99 := float64(*overview.Duration.P99)
-        if p99 > 2000 {
-            // From 2s upward, gradually penalize up to -20.
-            score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
-        }
-    }
-    if overview.TTFT.P99 != nil {
-        p99 := float64(*overview.TTFT.P99)
-        if p99 > 500 {
-            // TTFT > 500ms starts hurting; cap at -10.
-            score -= math.Min(10, (p99-500)/200) // 2.5s => -10
+        if p99 > 1000 {
+            if p99 <= 10000 {
+                latencyScore = (10000 - p99) / 9000 * 100
+            } else {
+                latencyScore = 0
+            }
         }
     }

-    // --- System metrics snapshot (best-effort) ---
+    // Weighted combination
+    return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
+}
+
+// computeInfraHealth calculates infrastructure health score (0-100)
+// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
+func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
+    // Storage score: DB critical, Redis less critical
+    storageScore := 100.0
     if overview.SystemMetrics != nil {
         if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
-            score -= 20
-        }
-        if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
-            score -= 15
-        }
-
-        if overview.SystemMetrics.CPUUsagePercent != nil {
-            cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
-            if cpuPct > 85 {
-                score -= math.Min(10, (cpuPct-85)*1.5)
-            }
-        }
-        if overview.SystemMetrics.MemoryUsagePercent != nil {
-            memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
-            if memPct > 90 {
-                score -= math.Min(10, (memPct-90)*1.0)
-            }
-        }
-
-        if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
-            waiting := float64(*overview.SystemMetrics.DBConnWaiting)
-            score -= math.Min(10, waiting*2)
-        }
-        if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
-            depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
-            score -= math.Min(10, depth*0.5)
+            storageScore = 0 // DB failure is critical
+        } else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
+            storageScore = 50 // Redis failure is degraded but not critical
         }
     }

-    // --- Job heartbeats (best-effort) ---
-    // Penalize only clear "error after last success" signals, and cap the impact.
-    jobPenalty := 0.0
+    // Compute resources score: CPU + Memory
+    computeScore := 100.0
+    if overview.SystemMetrics != nil {
+        cpuScore := 100.0
+        if overview.SystemMetrics.CPUUsagePercent != nil {
+            cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
+            if cpuPct > 80 {
+                if cpuPct <= 100 {
+                    cpuScore = (100 - cpuPct) / 20 * 100
+                } else {
+                    cpuScore = 0
+                }
+            }
+        }
+
+        memScore := 100.0
+        if overview.SystemMetrics.MemoryUsagePercent != nil {
+            memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
+            if memPct > 85 {
+                if memPct <= 100 {
+                    memScore = (100 - memPct) / 15 * 100
+                } else {
+                    memScore = 0
+                }
+            }
+        }
+
+        computeScore = (cpuScore + memScore) / 2
+    }
+
+    // Background jobs score
+    jobScore := 100.0
+    failedJobs := 0
+    totalJobs := 0
     for _, hb := range overview.JobHeartbeats {
         if hb == nil {
             continue
         }
+        totalJobs++
         if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
-            jobPenalty += 5
-            continue
-        }
-        if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
-            jobPenalty += 2
+            failedJobs++
+        } else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
+            failedJobs++
         }
     }
-    score -= math.Min(15, jobPenalty)
+    if totalJobs > 0 && failedJobs > 0 {
+        jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
+    }

-    score = clampFloat64(score, 0, 100)
-    return int(math.Round(score))
+    // Weighted combination
+    return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
 }

 func clampFloat64(v float64, min float64, max float64) float64 {
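To make the layered weighting concrete, here is a small standalone sketch (not part of the commit; the window values are hypothetical) that repeats the arithmetic above for one sample window: SLA 97%, worst-case error rate 2%, duration P99 of 4s, Redis down, CPU at 90%, memory at 60%, and healthy job heartbeats.

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Business sub-scores (same linear ramps as computeBusinessHealth).
	slaScore := (97.0 - 95) / 4.5 * 100           // ≈ 44.4 (99.5% → 100, 95% → 0)
	errorScore := (5.0 - 2) / 4.5 * 100           // ≈ 66.7 (0.5% → 100, 5% → 0)
	latencyScore := (10000.0 - 4000) / 9000 * 100 // ≈ 66.7 (1s → 100, 10s → 0)
	business := slaScore*0.5 + errorScore*0.3 + latencyScore*0.2 // ≈ 55.6

	// Infrastructure sub-scores (same rules as computeInfraHealth).
	storageScore := 50.0                      // DB ok, Redis down
	cpuScore := (100.0 - 90) / 20 * 100       // CPU above the 80% knee → 50
	memScore := 100.0                         // memory below the 85% knee
	computeScore := (cpuScore + memScore) / 2 // 75
	jobScore := 100.0                         // no failed or stale heartbeats
	infra := storageScore*0.4 + computeScore*0.3 + jobScore*0.3 // 72.5

	// Final 70/30 blend, rounded like computeDashboardHealthScore.
	fmt.Println(int(math.Round(business*0.7 + infra*0.3))) // prints 61
}
```

A fully degraded business layer can therefore pull the score down by at most 70 points, while a total infrastructure outage alone costs at most 30, which is what keeps DB or Redis failures from double-counting against metrics they already degrade.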
@@ -55,6 +55,377 @@ func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
     require.GreaterOrEqual(t, score, 0)
 }

+func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
+    t.Parallel()
+
+    tests := []struct {
+        name     string
+        overview *OpsDashboardOverview
+        wantMin  int
+        wantMax  int
+    }{
+        {
+            name:     "nil overview returns 0",
+            overview: nil,
+            wantMin:  0,
+            wantMax:  0,
+        },
+        {
+            name: "perfect health",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               1.0,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+                TTFT:              OpsPercentiles{P99: intPtr(100)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(30),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 100,
+            wantMax: 100,
+        },
+        {
+            name: "good health - SLA 99.8%",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               0.998,
+                ErrorRate:         0.003,
+                UpstreamErrorRate: 0.001,
+                Duration:          OpsPercentiles{P99: intPtr(800)},
+                TTFT:              OpsPercentiles{P99: intPtr(200)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(50),
+                    MemoryUsagePercent: float64Ptr(60),
+                },
+            },
+            wantMin: 95,
+            wantMax: 100,
+        },
+        {
+            name: "medium health - SLA 96%",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               0.96,
+                ErrorRate:         0.02,
+                UpstreamErrorRate: 0.01,
+                Duration:          OpsPercentiles{P99: intPtr(3000)},
+                TTFT:              OpsPercentiles{P99: intPtr(600)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(70),
+                    MemoryUsagePercent: float64Ptr(75),
+                },
+            },
+            wantMin: 60,
+            wantMax: 85,
+        },
+        {
+            name: "DB failure",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               0.995,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(false),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(30),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 70,
+            wantMax: 90,
+        },
+        {
+            name: "Redis failure",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               0.995,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(false),
+                    CPUUsagePercent:    float64Ptr(30),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 85,
+            wantMax: 95,
+        },
+        {
+            name: "high CPU usage",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               0.995,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(95),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 85,
+            wantMax: 100,
+        },
+        {
+            name: "combined failures - business degraded + infra healthy",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               0.90,
+                ErrorRate:         0.05,
+                UpstreamErrorRate: 0.02,
+                Duration:          OpsPercentiles{P99: intPtr(10000)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(20),
+                    MemoryUsagePercent: float64Ptr(30),
+                },
+            },
+            wantMin: 25,
+            wantMax: 50,
+        },
+        {
+            name: "combined failures - business healthy + infra degraded",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                RequestCountSLA:   1000,
+                SLA:               0.998,
+                ErrorRate:         0.001,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(600)},
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(false),
+                    RedisOK:            boolPtr(false),
+                    CPUUsagePercent:    float64Ptr(95),
+                    MemoryUsagePercent: float64Ptr(95),
+                },
+            },
+            wantMin: 70,
+            wantMax: 90,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
+            require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
+            require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
+            require.GreaterOrEqual(t, score, 0, "score must be >= 0")
+            require.LessOrEqual(t, score, 100, "score must be <= 100")
+        })
+    }
+}
+
+func TestComputeBusinessHealth(t *testing.T) {
+    t.Parallel()
+
+    tests := []struct {
+        name     string
+        overview *OpsDashboardOverview
+        wantMin  float64
+        wantMax  float64
+    }{
+        {
+            name: "perfect metrics",
+            overview: &OpsDashboardOverview{
+                SLA:               1.0,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+            },
+            wantMin: 100,
+            wantMax: 100,
+        },
+        {
+            name: "SLA boundary 99.5%",
+            overview: &OpsDashboardOverview{
+                SLA:               0.995,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+            },
+            wantMin: 100,
+            wantMax: 100,
+        },
+        {
+            name: "SLA boundary 95%",
+            overview: &OpsDashboardOverview{
+                SLA:               0.95,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+            },
+            wantMin: 50,
+            wantMax: 60,
+        },
+        {
+            name: "error rate boundary 0.5%",
+            overview: &OpsDashboardOverview{
+                SLA:               0.995,
+                ErrorRate:         0.005,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+            },
+            wantMin: 95,
+            wantMax: 100,
+        },
+        {
+            name: "latency boundary 1000ms",
+            overview: &OpsDashboardOverview{
+                SLA:               0.995,
+                ErrorRate:         0,
+                UpstreamErrorRate: 0,
+                Duration:          OpsPercentiles{P99: intPtr(1000)},
+            },
+            wantMin: 95,
+            wantMax: 100,
+        },
+        {
+            name: "upstream error dominates",
+            overview: &OpsDashboardOverview{
+                SLA:               0.995,
+                ErrorRate:         0.001,
+                UpstreamErrorRate: 0.03,
+                Duration:          OpsPercentiles{P99: intPtr(500)},
+            },
+            wantMin: 75,
+            wantMax: 90,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            score := computeBusinessHealth(tt.overview)
+            require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
+            require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
+            require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
+            require.LessOrEqual(t, score, 100.0, "score must be <= 100")
+        })
+    }
+}
+
+func TestComputeInfraHealth(t *testing.T) {
+    t.Parallel()
+
+    now := time.Now().UTC()
+
+    tests := []struct {
+        name     string
+        overview *OpsDashboardOverview
+        wantMin  float64
+        wantMax  float64
+    }{
+        {
+            name: "all infrastructure healthy",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(30),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 100,
+            wantMax: 100,
+        },
+        {
+            name: "DB down",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(false),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(30),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 50,
+            wantMax: 70,
+        },
+        {
+            name: "Redis down",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(false),
+                    CPUUsagePercent:    float64Ptr(30),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 80,
+            wantMax: 95,
+        },
+        {
+            name: "CPU at 90%",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(90),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+            },
+            wantMin: 85,
+            wantMax: 95,
+        },
+        {
+            name: "failed background job",
+            overview: &OpsDashboardOverview{
+                RequestCountTotal: 1000,
+                SystemMetrics: &OpsSystemMetricsSnapshot{
+                    DBOK:               boolPtr(true),
+                    RedisOK:            boolPtr(true),
+                    CPUUsagePercent:    float64Ptr(30),
+                    MemoryUsagePercent: float64Ptr(40),
+                },
+                JobHeartbeats: []*OpsJobHeartbeat{
+                    {
+                        JobName:     "test-job",
+                        LastErrorAt: &now,
+                    },
+                },
+            },
+            wantMin: 70,
+            wantMax: 90,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            score := computeInfraHealth(now, tt.overview)
+            require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
+            require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
+            require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
+            require.LessOrEqual(t, score, 100.0, "score must be <= 100")
+        })
+    }
+}
+
 func timePtr(v time.Time) *time.Time { return &v }

 func stringPtr(v string) *string { return &v }
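The table tests above also use intPtr, boolPtr and float64Ptr helpers that are outside this hunk. Assuming they follow the same pattern as the timePtr/stringPtr helpers shown here (an assumption, not part of the diff), they would look roughly like this:

```go
// Hypothetical pointer helpers, mirroring timePtr/stringPtr above; the real ones may differ.
func intPtr(v int) *int             { return &v }
func boolPtr(v bool) *bool          { return &v }
func float64Ptr(v float64) *float64 { return &v }
```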
@@ -1928,22 +1928,62 @@ export default {
       footer: 'Automated diagnostic suggestions based on current metrics',
       idle: 'System is currently idle',
       idleImpact: 'No active traffic',
+      // Resource diagnostics
+      dbDown: 'Database connection failed',
+      dbDownImpact: 'All database operations will fail',
+      dbDownAction: 'Check database service status, network connectivity, and connection configuration',
+      redisDown: 'Redis connection failed',
+      redisDownImpact: 'Cache functionality degraded, performance may decline',
+      redisDownAction: 'Check Redis service status and network connectivity',
+      cpuCritical: 'CPU usage critically high ({usage}%)',
+      cpuCriticalImpact: 'System response slowing, may affect all requests',
+      cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
+      cpuHigh: 'CPU usage elevated ({usage}%)',
+      cpuHighImpact: 'System load is high, needs attention',
+      cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
+      memoryCritical: 'Memory usage critically high ({usage}%)',
+      memoryCriticalImpact: 'May trigger OOM, system stability threatened',
+      memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
+      memoryHigh: 'Memory usage elevated ({usage}%)',
+      memoryHighImpact: 'Memory pressure is high, needs attention',
+      memoryHighAction: 'Monitor memory trends, check for memory leaks',
+      // Latency diagnostics
+      latencyCritical: 'Response latency critically high ({latency}ms)',
+      latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
+      latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
+      latencyHigh: 'Response latency elevated ({latency}ms)',
+      latencyHighImpact: 'User experience degraded, needs optimization',
+      latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
+      ttftHigh: 'Time to first byte elevated ({ttft}ms)',
+      ttftHighImpact: 'User perceived latency increased',
+      ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
+      // Error rate diagnostics
       upstreamCritical: 'Upstream error rate critically high ({rate}%)',
       upstreamCriticalImpact: 'May affect many user requests',
+      upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
       upstreamHigh: 'Upstream error rate elevated ({rate}%)',
       upstreamHighImpact: 'Recommend checking upstream service status',
-      slaCritical: 'SLA critically below target ({sla}%)',
-      slaCriticalImpact: 'User experience severely degraded',
-      slaLow: 'SLA below target ({sla}%)',
-      slaLowImpact: 'Service quality needs attention',
+      upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
       errorHigh: 'Error rate too high ({rate}%)',
       errorHighImpact: 'Many requests failing',
+      errorHighAction: 'Check error logs, identify root cause, urgent fix required',
       errorElevated: 'Error rate elevated ({rate}%)',
       errorElevatedImpact: 'Recommend checking error logs',
+      errorElevatedAction: 'Analyze error types and distribution, create fix plan',
+      // SLA diagnostics
+      slaCritical: 'SLA critically below target ({sla}%)',
+      slaCriticalImpact: 'User experience severely degraded',
+      slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
+      slaLow: 'SLA below target ({sla}%)',
+      slaLowImpact: 'Service quality needs attention',
+      slaLowAction: 'Analyze SLA decline causes, optimize system performance',
+      // Health score diagnostics
       healthCritical: 'Overall health score critically low ({score})',
       healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
+      healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
       healthLow: 'Overall health score low ({score})',
       healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
+      healthLowAction: 'Monitor metric trends, prevent issue escalation',
       healthy: 'All system metrics normal',
       healthyImpact: 'Service running stable'
     },
@@ -2074,22 +2074,62 @@ export default {
       footer: '基于当前指标的自动诊断建议',
       idle: '系统当前处于待机状态',
      idleImpact: '无活跃流量',
+      // Resource diagnostics
+      dbDown: '数据库连接失败',
+      dbDownImpact: '所有数据库操作将失败',
+      dbDownAction: '检查数据库服务状态、网络连接和连接配置',
+      redisDown: 'Redis连接失败',
+      redisDownImpact: '缓存功能降级,性能可能下降',
+      redisDownAction: '检查Redis服务状态和网络连接',
+      cpuCritical: 'CPU使用率严重过高 ({usage}%)',
+      cpuCriticalImpact: '系统响应变慢,可能影响所有请求',
+      cpuCriticalAction: '检查CPU密集型任务,考虑扩容或优化代码',
+      cpuHigh: 'CPU使用率偏高 ({usage}%)',
+      cpuHighImpact: '系统负载较高,需要关注',
+      cpuHighAction: '监控CPU趋势,准备扩容方案',
+      memoryCritical: '内存使用率严重过高 ({usage}%)',
+      memoryCriticalImpact: '可能触发OOM,系统稳定性受威胁',
+      memoryCriticalAction: '检查内存泄漏,考虑增加内存或优化内存使用',
+      memoryHigh: '内存使用率偏高 ({usage}%)',
+      memoryHighImpact: '内存压力较大,需要关注',
+      memoryHighAction: '监控内存趋势,检查是否有内存泄漏',
+      // Latency diagnostics
+      latencyCritical: '响应延迟严重过高 ({latency}ms)',
+      latencyCriticalImpact: '用户体验极差,大量请求超时',
+      latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
+      latencyHigh: '响应延迟偏高 ({latency}ms)',
+      latencyHighImpact: '用户体验下降,需要优化',
+      latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑',
+      ttftHigh: '首字节时间偏高 ({ttft}ms)',
+      ttftHighImpact: '用户感知延迟增加',
+      ttftHighAction: '优化请求处理流程,减少前置逻辑耗时',
+      // Error rate diagnostics
       upstreamCritical: '上游错误率严重偏高 ({rate}%)',
       upstreamCriticalImpact: '可能影响大量用户请求',
+      upstreamCriticalAction: '检查上游服务健康状态,启用降级策略',
       upstreamHigh: '上游错误率偏高 ({rate}%)',
       upstreamHighImpact: '建议检查上游服务状态',
-      slaCritical: 'SLA 严重低于目标 ({sla}%)',
-      slaCriticalImpact: '用户体验严重受损',
-      slaLow: 'SLA 低于目标 ({sla}%)',
-      slaLowImpact: '需要关注服务质量',
+      upstreamHighAction: '联系上游服务团队,准备降级方案',
       errorHigh: '错误率过高 ({rate}%)',
       errorHighImpact: '大量请求失败',
+      errorHighAction: '查看错误日志,定位错误根因,紧急修复',
       errorElevated: '错误率偏高 ({rate}%)',
       errorElevatedImpact: '建议检查错误日志',
+      errorElevatedAction: '分析错误类型和分布,制定修复计划',
+      // SLA diagnostics
+      slaCritical: 'SLA 严重低于目标 ({sla}%)',
+      slaCriticalImpact: '用户体验严重受损',
+      slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护',
+      slaLow: 'SLA 低于目标 ({sla}%)',
+      slaLowImpact: '需要关注服务质量',
+      slaLowAction: '分析SLA下降原因,优化系统性能',
+      // Health score diagnostics
       healthCritical: '综合健康评分过低 ({score})',
       healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
+      healthCriticalAction: '全面检查系统状态,优先处理critical级别问题',
       healthLow: '综合健康评分偏低 ({score})',
       healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
+      healthLowAction: '监控指标趋势,预防问题恶化',
       healthy: '所有系统指标正常',
       healthyImpact: '服务运行稳定'
     },
@@ -287,6 +287,7 @@ interface DiagnosisItem {
   type: 'critical' | 'warning' | 'info'
   message: string
   impact: string
+  action?: string
 }

 const diagnosisReport = computed<DiagnosisItem[]>(() => {
@@ -304,63 +305,157 @@ const diagnosisReport = computed<DiagnosisItem[]>(() => {
     return report
   }

-  const upstreamRatePct = (ov.upstream_error_rate ?? 0) * 100
-  if (upstreamRatePct > 10) {
+  // Resource diagnostics (highest priority)
+  const sm = ov.system_metrics
+  if (sm) {
+    if (sm.db_ok === false) {
+      report.push({
+        type: 'critical',
+        message: t('admin.ops.diagnosis.dbDown'),
+        impact: t('admin.ops.diagnosis.dbDownImpact'),
+        action: t('admin.ops.diagnosis.dbDownAction')
+      })
+    }
+    if (sm.redis_ok === false) {
+      report.push({
+        type: 'warning',
+        message: t('admin.ops.diagnosis.redisDown'),
+        impact: t('admin.ops.diagnosis.redisDownImpact'),
+        action: t('admin.ops.diagnosis.redisDownAction')
+      })
+    }
+
+    const cpuPct = sm.cpu_usage_percent ?? 0
+    if (cpuPct > 90) {
+      report.push({
+        type: 'critical',
+        message: t('admin.ops.diagnosis.cpuCritical', { usage: cpuPct.toFixed(1) }),
+        impact: t('admin.ops.diagnosis.cpuCriticalImpact'),
+        action: t('admin.ops.diagnosis.cpuCriticalAction')
+      })
+    } else if (cpuPct > 80) {
+      report.push({
+        type: 'warning',
+        message: t('admin.ops.diagnosis.cpuHigh', { usage: cpuPct.toFixed(1) }),
+        impact: t('admin.ops.diagnosis.cpuHighImpact'),
+        action: t('admin.ops.diagnosis.cpuHighAction')
+      })
+    }
+
+    const memPct = sm.memory_usage_percent ?? 0
+    if (memPct > 90) {
+      report.push({
+        type: 'critical',
+        message: t('admin.ops.diagnosis.memoryCritical', { usage: memPct.toFixed(1) }),
+        impact: t('admin.ops.diagnosis.memoryCriticalImpact'),
+        action: t('admin.ops.diagnosis.memoryCriticalAction')
+      })
+    } else if (memPct > 85) {
+      report.push({
+        type: 'warning',
+        message: t('admin.ops.diagnosis.memoryHigh', { usage: memPct.toFixed(1) }),
+        impact: t('admin.ops.diagnosis.memoryHighImpact'),
+        action: t('admin.ops.diagnosis.memoryHighAction')
+      })
+    }
+  }
+
+  // Latency diagnostics
+  const durationP99 = ov.duration?.p99_ms ?? 0
+  if (durationP99 > 2000) {
     report.push({
       type: 'critical',
-      message: t('admin.ops.diagnosis.upstreamCritical', { rate: upstreamRatePct.toFixed(2) }),
-      impact: t('admin.ops.diagnosis.upstreamCriticalImpact')
+      message: t('admin.ops.diagnosis.latencyCritical', { latency: durationP99.toFixed(0) }),
+      impact: t('admin.ops.diagnosis.latencyCriticalImpact'),
+      action: t('admin.ops.diagnosis.latencyCriticalAction')
     })
-  } else if (upstreamRatePct > 3) {
+  } else if (durationP99 > 1000) {
     report.push({
       type: 'warning',
-      message: t('admin.ops.diagnosis.upstreamHigh', { rate: upstreamRatePct.toFixed(2) }),
-      impact: t('admin.ops.diagnosis.upstreamHighImpact')
+      message: t('admin.ops.diagnosis.latencyHigh', { latency: durationP99.toFixed(0) }),
+      impact: t('admin.ops.diagnosis.latencyHighImpact'),
+      action: t('admin.ops.diagnosis.latencyHighAction')
     })
   }

+  const ttftP99 = ov.ttft?.p99_ms ?? 0
+  if (ttftP99 > 500) {
+    report.push({
+      type: 'warning',
+      message: t('admin.ops.diagnosis.ttftHigh', { ttft: ttftP99.toFixed(0) }),
+      impact: t('admin.ops.diagnosis.ttftHighImpact'),
+      action: t('admin.ops.diagnosis.ttftHighAction')
+    })
+  }
+
+  // Error rate diagnostics (adjusted thresholds)
+  const upstreamRatePct = (ov.upstream_error_rate ?? 0) * 100
+  if (upstreamRatePct > 5) {
+    report.push({
+      type: 'critical',
+      message: t('admin.ops.diagnosis.upstreamCritical', { rate: upstreamRatePct.toFixed(2) }),
+      impact: t('admin.ops.diagnosis.upstreamCriticalImpact'),
+      action: t('admin.ops.diagnosis.upstreamCriticalAction')
+    })
+  } else if (upstreamRatePct > 2) {
+    report.push({
+      type: 'warning',
+      message: t('admin.ops.diagnosis.upstreamHigh', { rate: upstreamRatePct.toFixed(2) }),
+      impact: t('admin.ops.diagnosis.upstreamHighImpact'),
+      action: t('admin.ops.diagnosis.upstreamHighAction')
+    })
+  }
+
+  const errorPct = (ov.error_rate ?? 0) * 100
+  if (errorPct > 3) {
+    report.push({
+      type: 'critical',
+      message: t('admin.ops.diagnosis.errorHigh', { rate: errorPct.toFixed(2) }),
+      impact: t('admin.ops.diagnosis.errorHighImpact'),
+      action: t('admin.ops.diagnosis.errorHighAction')
+    })
+  } else if (errorPct > 0.5) {
+    report.push({
+      type: 'warning',
+      message: t('admin.ops.diagnosis.errorElevated', { rate: errorPct.toFixed(2) }),
+      impact: t('admin.ops.diagnosis.errorElevatedImpact'),
+      action: t('admin.ops.diagnosis.errorElevatedAction')
+    })
+  }
+
+  // SLA diagnostics
   const slaPct = (ov.sla ?? 0) * 100
   if (slaPct < 90) {
     report.push({
       type: 'critical',
       message: t('admin.ops.diagnosis.slaCritical', { sla: slaPct.toFixed(2) }),
-      impact: t('admin.ops.diagnosis.slaCriticalImpact')
+      impact: t('admin.ops.diagnosis.slaCriticalImpact'),
+      action: t('admin.ops.diagnosis.slaCriticalAction')
     })
   } else if (slaPct < 98) {
     report.push({
       type: 'warning',
       message: t('admin.ops.diagnosis.slaLow', { sla: slaPct.toFixed(2) }),
-      impact: t('admin.ops.diagnosis.slaLowImpact')
-    })
-  }
-
-  const errorPct = (ov.error_rate ?? 0) * 100
-  if (errorPct > 5) {
-    report.push({
-      type: 'critical',
-      message: t('admin.ops.diagnosis.errorHigh', { rate: errorPct.toFixed(2) }),
-      impact: t('admin.ops.diagnosis.errorHighImpact')
-    })
-  } else if (errorPct > 1) {
-    report.push({
-      type: 'warning',
-      message: t('admin.ops.diagnosis.errorElevated', { rate: errorPct.toFixed(2) }),
-      impact: t('admin.ops.diagnosis.errorElevatedImpact')
+      impact: t('admin.ops.diagnosis.slaLowImpact'),
+      action: t('admin.ops.diagnosis.slaLowAction')
     })
   }

+  // Health score diagnostics (lowest priority)
   if (healthScoreValue.value != null) {
     if (healthScoreValue.value < 60) {
       report.push({
         type: 'critical',
         message: t('admin.ops.diagnosis.healthCritical', { score: healthScoreValue.value }),
-        impact: t('admin.ops.diagnosis.healthCriticalImpact')
+        impact: t('admin.ops.diagnosis.healthCriticalImpact'),
+        action: t('admin.ops.diagnosis.healthCriticalAction')
       })
     } else if (healthScoreValue.value < 90) {
       report.push({
         type: 'warning',
         message: t('admin.ops.diagnosis.healthLow', { score: healthScoreValue.value }),
-        impact: t('admin.ops.diagnosis.healthLowImpact')
+        impact: t('admin.ops.diagnosis.healthLowImpact'),
+        action: t('admin.ops.diagnosis.healthLowAction')
       })
     }
   }
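All of the adjusted knees above (duration P99 2s/1s, TTFT 500ms, upstream errors 5%/2%, request errors 3%/0.5%, SLA 90%/98%) follow the same warning/critical pattern. The following standalone sketch, written in Go to match the backend examples earlier (the type, function and sample values are illustrative only, not part of this commit), shows that mapping in one place.

```go
package main

import "fmt"

// rule mirrors one warning/critical knee pair from the diagnosisReport computed above.
type rule struct {
	name     string
	value    float64 // current metric value
	warn     float64 // warning knee
	crit     float64 // critical knee
	lowIsBad bool    // true for SLA (lower is worse), false for latency/error rates
}

func severity(r rule) string {
	worse := func(v, knee float64) bool {
		if r.lowIsBad {
			return v < knee
		}
		return v > knee
	}
	switch {
	case worse(r.value, r.crit):
		return "critical"
	case worse(r.value, r.warn):
		return "warning"
	default:
		return "ok"
	}
}

func main() {
	// Thresholds copied from the Vue computed; the metric values are invented.
	checks := []rule{
		{name: "duration p99 (ms)", value: 1500, warn: 1000, crit: 2000},
		{name: "upstream error %", value: 6, warn: 2, crit: 5},
		{name: "request error %", value: 1.2, warn: 0.5, crit: 3},
		{name: "sla %", value: 96.5, warn: 98, crit: 90, lowIsBad: true},
	}
	for _, c := range checks {
		fmt.Printf("%s => %s\n", c.name, severity(c)) // warning, critical, warning, warning
	}
}
```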
@@ -752,9 +847,12 @@ function openJobsDetails() {
             />
           </svg>
         </div>
-        <div>
+        <div class="flex-1">
           <div class="text-xs font-semibold text-gray-900 dark:text-white">{{ item.message }}</div>
           <div class="mt-0.5 text-[11px] text-gray-500 dark:text-gray-400">{{ item.impact }}</div>
+          <div v-if="item.action" class="mt-1 text-[11px] text-blue-600 dark:text-blue-400">
+            💡 {{ item.action }}
+          </div>
         </div>
       </div>
     </div>