From 585257d34030c5f068a444cc1b718cc73ae9fa37 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sat, 10 Jan 2026 01:38:47 +0800
Subject: [PATCH] feat(ops monitoring): enhance monitoring features and health
 scoring system
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Backend improvements:
- Add health score calculation service (ops_health_score.go)
- Add distributed lock support (ops_advisory_lock.go)
- Improve metrics collection and aggregation logic
- Add configurable ops metrics collection interval (60-3600 seconds)
- Remove unused WebSocket query-token auth middleware
- Improve cleanup service and alert evaluation logic

Frontend improvements:
- Simplify the OpsDashboard component structure
- Improve i18n texts (Chinese and English)
- Add API type definitions for ops monitoring
- Add settings UI for the ops metrics collection interval
- Improve the error detail modal

Tests:
- Add health score unit tests
- Update API contract tests
---
 .../internal/handler/admin/setting_handler.go | 28 +-
 backend/internal/handler/dto/settings.go | 5 +-
 .../internal/repository/ops_repo_metrics.go | 27 +-
 backend/internal/server/api_contract_test.go | 4 +-
 .../server/middleware/ws_query_token_auth.go | 54 ----
 backend/internal/server/router.go | 2 -
 backend/internal/service/ops_advisory_lock.go | 46 ++++
 .../service/ops_aggregation_service.go | 33 ++-
 .../service/ops_alert_evaluator_service.go | 7 +-
 .../internal/service/ops_cleanup_service.go | 36 +--
 backend/internal/service/ops_dashboard.go | 13 +
 .../internal/service/ops_dashboard_models.go | 4 +
 backend/internal/service/ops_health_score.go | 126 +++++++++
 .../internal/service/ops_health_score_test.go | 60 +++++
 .../internal/service/ops_metrics_collector.go | 66 ++---
 backend/internal/service/ops_port.go | 10 +
 backend/internal/service/setting_service.go | 16 ++
 backend/internal/service/settings_view.go | 1 +
 frontend/src/api/admin/ops.ts | 5 +
 frontend/src/api/admin/settings.ts | 2 +
 frontend/src/i18n/locales/en.ts | 60 ++++-
 frontend/src/i18n/locales/zh.ts | 76 +++++-
 frontend/src/views/admin/SettingsView.vue | 25 +-
 frontend/src/views/admin/ops/OpsDashboard.vue | 245 +----------------
 .../ops/components/OpsErrorDetailModal.vue | 4 +-
 25 files changed, 570 insertions(+), 385 deletions(-)
 delete mode 100644 backend/internal/server/middleware/ws_query_token_auth.go
 create mode 100644 backend/internal/service/ops_advisory_lock.go
 create mode 100644 backend/internal/service/ops_health_score.go
 create mode 100644 backend/internal/service/ops_health_score_test.go

diff --git a/backend/internal/handler/admin/setting_handler.go b/backend/internal/handler/admin/setting_handler.go
index 4d4d5639..59f47010 100644
--- a/backend/internal/handler/admin/setting_handler.go
+++ b/backend/internal/handler/admin/setting_handler.go
@@ -68,6 +68,7 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
 		OpsMonitoringEnabled: settings.OpsMonitoringEnabled,
 		OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
 		OpsQueryModeDefault: settings.OpsQueryModeDefault,
+		OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds,
 	})
 }
 
@@ -115,9 +116,10 @@ type UpdateSettingsRequest struct {
 	IdentityPatchPrompt string `json:"identity_patch_prompt"`
 
 	// Ops monitoring (vNext)
-	OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
-	OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
 	OpsQueryModeDefault *string `json:"ops_query_mode_default"`
+	OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"`
 }
 
 // UpdateSettings 更新系统设置
@@ -173,6 +175,18 @@ func (h *SettingHandler) 
UpdateSettings(c *gin.Context) { } } + // Ops metrics collector interval validation (seconds). + if req.OpsMetricsIntervalSeconds != nil { + v := *req.OpsMetricsIntervalSeconds + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + req.OpsMetricsIntervalSeconds = &v + } + settings := &service.SystemSettings{ RegistrationEnabled: req.RegistrationEnabled, EmailVerifyEnabled: req.EmailVerifyEnabled, @@ -219,6 +233,12 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { } return previousSettings.OpsQueryModeDefault }(), + OpsMetricsIntervalSeconds: func() int { + if req.OpsMetricsIntervalSeconds != nil { + return *req.OpsMetricsIntervalSeconds + } + return previousSettings.OpsMetricsIntervalSeconds + }(), } if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil { @@ -266,6 +286,7 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled, OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled, OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault, + OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds, }) } @@ -375,6 +396,9 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings, if before.OpsQueryModeDefault != after.OpsQueryModeDefault { changed = append(changed, "ops_query_mode_default") } + if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds { + changed = append(changed, "ops_metrics_interval_seconds") + } return changed } diff --git a/backend/internal/handler/dto/settings.go b/backend/internal/handler/dto/settings.go index 6fd53b26..3f631bfa 100644 --- a/backend/internal/handler/dto/settings.go +++ b/backend/internal/handler/dto/settings.go @@ -39,9 +39,10 @@ type SystemSettings struct { IdentityPatchPrompt string `json:"identity_patch_prompt"` // Ops monitoring (vNext) - OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` - OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` + OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` OpsQueryModeDefault string `json:"ops_query_mode_default"` + OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"` } type PublicSettings struct { diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go index 96bad88a..75345595 100644 --- a/backend/internal/repository/ops_repo_metrics.go +++ b/backend/internal/repository/ops_repo_metrics.go @@ -68,6 +68,9 @@ INSERT INTO ops_system_metrics ( db_ok, redis_ok, + redis_conn_total, + redis_conn_idle, + db_conn_active, db_conn_idle, db_conn_waiting, @@ -83,8 +86,9 @@ INSERT INTO ops_system_metrics ( $21,$22,$23,$24,$25,$26, $27,$28,$29,$30, $31,$32, - $33,$34,$35, - $36,$37 + $33,$34, + $35,$36,$37, + $38,$39 )` _, err := r.db.ExecContext( @@ -130,6 +134,9 @@ INSERT INTO ops_system_metrics ( opsNullBool(input.DBOK), opsNullBool(input.RedisOK), + opsNullInt(input.RedisConnTotal), + opsNullInt(input.RedisConnIdle), + opsNullInt(input.DBConnActive), opsNullInt(input.DBConnIdle), opsNullInt(input.DBConnWaiting), @@ -162,6 +169,9 @@ SELECT db_ok, redis_ok, + redis_conn_total, + redis_conn_idle, + db_conn_active, db_conn_idle, db_conn_waiting, @@ -182,6 +192,8 @@ LIMIT 1` var memPct sql.NullFloat64 var dbOK sql.NullBool var redisOK sql.NullBool + var redisTotal sql.NullInt64 + var redisIdle sql.NullInt64 var dbActive sql.NullInt64 var dbIdle sql.NullInt64 var 
dbWaiting sql.NullInt64 @@ -198,6 +210,8 @@ LIMIT 1` &memPct, &dbOK, &redisOK, + &redisTotal, + &redisIdle, &dbActive, &dbIdle, &dbWaiting, @@ -231,6 +245,14 @@ LIMIT 1` v := redisOK.Bool out.RedisOK = &v } + if redisTotal.Valid { + v := int(redisTotal.Int64) + out.RedisConnTotal = &v + } + if redisIdle.Valid { + v := int(redisIdle.Int64) + out.RedisConnIdle = &v + } if dbActive.Valid { v := int(dbActive.Int64) out.DBConnActive = &v @@ -398,4 +420,3 @@ func opsNullTime(v *time.Time) any { } return sql.NullTime{Time: *v, Valid: true} } - diff --git a/backend/internal/server/api_contract_test.go b/backend/internal/server/api_contract_test.go index 23cab19c..f8140fe6 100644 --- a/backend/internal/server/api_contract_test.go +++ b/backend/internal/server/api_contract_test.go @@ -319,7 +319,9 @@ func TestAPIContracts(t *testing.T) { "enable_identity_patch": true, "identity_patch_prompt": "", "ops_monitoring_enabled": true, - "ops_realtime_monitoring_enabled": true + "ops_realtime_monitoring_enabled": true, + "ops_query_mode_default": "auto", + "ops_metrics_interval_seconds": 60 } }`, }, diff --git a/backend/internal/server/middleware/ws_query_token_auth.go b/backend/internal/server/middleware/ws_query_token_auth.go deleted file mode 100644 index 3b8d086a..00000000 --- a/backend/internal/server/middleware/ws_query_token_auth.go +++ /dev/null @@ -1,54 +0,0 @@ -package middleware - -import ( - "net/http" - "strings" - - "github.com/gin-gonic/gin" -) - -// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header -// for WebSocket handshake requests on a small allow-list of endpoints. -// -// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes -// are protected by header-based auth. This keeps the token support scoped to WS only. -func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc { - return func(c *gin.Context) { - if c == nil || c.Request == nil { - if c != nil { - c.Next() - } - return - } - - // Only GET websocket upgrades. - if c.Request.Method != http.MethodGet { - c.Next() - return - } - if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") { - c.Next() - return - } - - // If caller already supplied auth headers, don't override. - if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" { - c.Next() - return - } - - // Allow-list ops websocket endpoints. - path := strings.TrimSpace(c.Request.URL.Path) - if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") { - c.Next() - return - } - - token := strings.TrimSpace(c.Query("token")) - if token != "" { - c.Request.Header.Set("Authorization", "Bearer "+token) - } - - c.Next() - } -} diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go index 85df99bd..3ea087d6 100644 --- a/backend/internal/server/router.go +++ b/backend/internal/server/router.go @@ -25,8 +25,6 @@ func SetupRouter( ) *gin.Engine { // 应用中间件 r.Use(middleware2.Logger()) - // WebSocket handshake auth helper (token via query param, WS endpoints only). 
- r.Use(middleware2.InjectBearerTokenFromQueryForWebSocket()) r.Use(middleware2.CORS(cfg.CORS)) r.Use(middleware2.SecurityHeaders(cfg.Security.CSP)) diff --git a/backend/internal/service/ops_advisory_lock.go b/backend/internal/service/ops_advisory_lock.go new file mode 100644 index 00000000..f7ef4cee --- /dev/null +++ b/backend/internal/service/ops_advisory_lock.go @@ -0,0 +1,46 @@ +package service + +import ( + "context" + "database/sql" + "hash/fnv" + "time" +) + +func hashAdvisoryLockID(key string) int64 { + h := fnv.New64a() + _, _ = h.Write([]byte(key)) + return int64(h.Sum64()) +} + +func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) { + if db == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + conn, err := db.Conn(ctx) + if err != nil { + return nil, false + } + + acquired := false + if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil { + _ = conn.Close() + return nil, false + } + if !acquired { + _ = conn.Close() + return nil, false + } + + release := func() { + unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID) + _ = conn.Close() + } + return release, true +} diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go index 04dbb11b..2a6afbba 100644 --- a/backend/internal/service/ops_aggregation_service.go +++ b/backend/internal/service/ops_aggregation_service.go @@ -376,28 +376,37 @@ return 0 `) func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { - if s == nil || s.redisClient == nil { - return nil, true + if s == nil { + return nil, false } if ctx == nil { ctx = context.Background() } - ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() - if err != nil { - // Fail-open: do not block single-instance deployments. - return nil, true + // Prefer Redis leader lock when available (multi-instance), but avoid stampeding + // the DB when Redis is flaky by falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + release := func() { + ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() + } + return release, true + } + // Redis error: fall through to DB advisory lock. 
} + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) if !ok { s.maybeLogSkip(logPrefix) return nil, false } - - release := func() { - ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() - } return release, true } diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go index b970c720..81712136 100644 --- a/backend/internal/service/ops_alert_evaluator_service.go +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -720,11 +720,12 @@ func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, loc ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() if err != nil { - // Fail-open for single-node environments, but warn. + // Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky. + // Single-node deployments can disable the distributed lock via runtime settings. s.warnNoRedisOnce.Do(func() { - log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err) + log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err) }) - return nil, true + return nil, false } if !ok { s.maybeLogSkip(key) diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go index ef825c04..08c6a16e 100644 --- a/backend/internal/service/ops_cleanup_service.go +++ b/backend/internal/service/ops_cleanup_service.go @@ -300,30 +300,36 @@ func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), b return nil, true } - if s.redisClient == nil { - s.warnNoRedisOnce.Do(func() { - log.Printf("[OpsCleanup] redis not configured; running without distributed lock") - }) - return nil, true - } - key := opsCleanupLeaderLockKeyDefault ttl := opsCleanupLeaderLockTTLDefault - ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() - if err != nil { + // Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by + // falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + return nil, false + } + return func() { + _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true + } + // Redis error: fall back to DB advisory lock. 
s.warnNoRedisOnce.Do(func() { - log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err) + log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err) + }) + } else { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] redis not configured; using DB advisory lock") }) - return nil, true } + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) if !ok { return nil, false } - - return func() { - _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() - }, true + return release, true } func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go index 23d6d82f..31822ba8 100644 --- a/backend/internal/service/ops_dashboard.go +++ b/backend/internal/service/ops_dashboard.go @@ -5,6 +5,7 @@ import ( "database/sql" "errors" "log" + "time" infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" ) @@ -39,6 +40,16 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo // Best-effort system health + jobs; dashboard metrics should still render if these are missing. if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { + // Attach config-derived limits so the UI can show "current / max" for connection pools. + // These are best-effort and should never block the dashboard rendering. + if s != nil && s.cfg != nil { + if s.cfg.Database.MaxOpenConns > 0 { + metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns) + } + if s.cfg.Redis.PoolSize > 0 { + metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize) + } + } overview.SystemMetrics = metrics } else if err != nil && !errors.Is(err, sql.ErrNoRows) { log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) @@ -50,6 +61,8 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo log.Printf("[Ops] ListJobHeartbeats failed: %v", err) } + overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview) + return overview, nil } diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go index 51a0b1fb..f189031b 100644 --- a/backend/internal/service/ops_dashboard_models.go +++ b/backend/internal/service/ops_dashboard_models.go @@ -35,6 +35,10 @@ type OpsDashboardOverview struct { Platform string `json:"platform"` GroupID *int64 `json:"group_id"` + // HealthScore is a backend-computed overall health score (0-100). + // It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats. + HealthScore int `json:"health_score"` + // Latest system-level snapshot (window=1m, global). SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go new file mode 100644 index 00000000..68cfc10d --- /dev/null +++ b/backend/internal/service/ops_health_score.go @@ -0,0 +1,126 @@ +package service + +import ( + "math" + "time" +) + +// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview. +// +// Design goals: +// - Backend-owned scoring (UI only displays). +// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs). +// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data. 
+func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int { + if overview == nil { + return 0 + } + + // Idle/no-data: avoid showing a "bad" score when there is no traffic. + // UI can still render a gray/idle state based on QPS + error rate. + if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 { + return 100 + } + + score := 100.0 + + // --- SLA (primary signal) --- + // SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later. + slaPct := clampFloat64(overview.SLA*100, 0, 100) + if slaPct < 99.5 { + // Up to -45 points as SLA drops. + score -= math.Min(45, (99.5-slaPct)*12) + } + + // --- Error rates (secondary signal) --- + errorPct := clampFloat64(overview.ErrorRate*100, 0, 100) + if errorPct > 1 { + // Cap at -20 points by 6% error rate. + score -= math.Min(20, (errorPct-1)*4) + } + + upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100) + if upstreamPct > 1 { + // Upstream instability deserves extra weight, but keep it smaller than SLA/error. + score -= math.Min(15, (upstreamPct-1)*3) + } + + // --- Latency (tail-focused) --- + // Use p99 of duration + TTFT. Penalize only when clearly elevated. + if overview.Duration.P99 != nil { + p99 := float64(*overview.Duration.P99) + if p99 > 2000 { + // From 2s upward, gradually penalize up to -20. + score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20 + } + } + if overview.TTFT.P99 != nil { + p99 := float64(*overview.TTFT.P99) + if p99 > 500 { + // TTFT > 500ms starts hurting; cap at -10. + score -= math.Min(10, (p99-500)/200) // 2.5s => -10 + } + } + + // --- System metrics snapshot (best-effort) --- + if overview.SystemMetrics != nil { + if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK { + score -= 20 + } + if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK { + score -= 15 + } + + if overview.SystemMetrics.CPUUsagePercent != nil { + cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100) + if cpuPct > 85 { + score -= math.Min(10, (cpuPct-85)*1.5) + } + } + if overview.SystemMetrics.MemoryUsagePercent != nil { + memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100) + if memPct > 90 { + score -= math.Min(10, (memPct-90)*1.0) + } + } + + if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 { + waiting := float64(*overview.SystemMetrics.DBConnWaiting) + score -= math.Min(10, waiting*2) + } + if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 { + depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth) + score -= math.Min(10, depth*0.5) + } + } + + // --- Job heartbeats (best-effort) --- + // Penalize only clear "error after last success" signals, and cap the impact. 
+ jobPenalty := 0.0 + for _, hb := range overview.JobHeartbeats { + if hb == nil { + continue + } + if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) { + jobPenalty += 5 + continue + } + if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute { + jobPenalty += 2 + } + } + score -= math.Min(15, jobPenalty) + + score = clampFloat64(score, 0, 100) + return int(math.Round(score)) +} + +func clampFloat64(v float64, min float64, max float64) float64 { + if v < min { + return min + } + if v > max { + return max + } + return v +} diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go new file mode 100644 index 00000000..d7e5dd8c --- /dev/null +++ b/backend/internal/service/ops_health_score_test.go @@ -0,0 +1,60 @@ +//go:build unit + +package service + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) { + t.Parallel() + + score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}) + require.Equal(t, 100, score) +} + +func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) { + t.Parallel() + + ov := &OpsDashboardOverview{ + RequestCountTotal: 100, + RequestCountSLA: 100, + SuccessCount: 90, + ErrorCountTotal: 10, + ErrorCountSLA: 10, + + SLA: 0.90, + ErrorRate: 0.10, + UpstreamErrorRate: 0.08, + + Duration: OpsPercentiles{P99: intPtr(20_000)}, + TTFT: OpsPercentiles{P99: intPtr(2_000)}, + + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(98.0), + MemoryUsagePercent: float64Ptr(97.0), + DBConnWaiting: intPtr(3), + ConcurrencyQueueDepth: intPtr(10), + }, + JobHeartbeats: []*OpsJobHeartbeat{ + { + JobName: "job-a", + LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)), + LastError: stringPtr("boom"), + }, + }, + } + + score := computeDashboardHealthScore(time.Now().UTC(), ov) + require.Less(t, score, 80) + require.GreaterOrEqual(t, score, 0) +} + +func timePtr(v time.Time) *time.Time { return &v } + +func stringPtr(v string) *string { return &v } diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go index cd90e1bd..e55e365b 100644 --- a/backend/internal/service/ops_metrics_collector.go +++ b/backend/internal/service/ops_metrics_collector.go @@ -5,7 +5,6 @@ import ( "database/sql" "errors" "fmt" - "hash/fnv" "log" "math" "os" @@ -262,6 +261,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { dbOK := c.checkDB(ctx) redisOK := c.checkRedis(ctx) active, idle := c.dbPoolStats() + redisTotal, redisIdle, redisStatsOK := c.redisPoolStats() successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) if err != nil { @@ -327,6 +327,19 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { DBOK: boolPtr(dbOK), RedisOK: boolPtr(redisOK), + RedisConnTotal: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisTotal) + }(), + RedisConnIdle: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisIdle) + }(), + DBConnActive: intPtr(active), DBConnIdle: intPtr(idle), GoroutineCount: intPtr(goroutines), @@ -722,6 +735,17 @@ func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool { return c.redisClient.Ping(ctx).Err() == nil } +func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) 
{ + if c == nil || c.redisClient == nil { + return 0, 0, false + } + stats := c.redisClient.PoolStats() + if stats == nil { + return 0, 0, false + } + return int(stats.TotalConns), int(stats.IdleConns), true +} + func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { if c == nil || c.db == nil { return 0, 0 @@ -749,7 +773,7 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), if err != nil { // Prefer fail-closed to avoid stampeding the database when Redis is flaky. // Fallback to a DB advisory lock when Redis is present but unavailable. - release, ok := c.tryAcquireDBAdvisoryLock(ctx) + release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID) if !ok { c.maybeLogSkip() return nil, false @@ -769,38 +793,6 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), return release, true } -func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) { - if c == nil || c.db == nil { - return nil, false - } - if ctx == nil { - ctx = context.Background() - } - - conn, err := c.db.Conn(ctx) - if err != nil { - return nil, false - } - - acquired := false - if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil { - _ = conn.Close() - return nil, false - } - if !acquired { - _ = conn.Close() - return nil, false - } - - release := func() { - unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID) - _ = conn.Close() - } - return release, true -} - func (c *OpsMetricsCollector) maybeLogSkip() { c.skipLogMu.Lock() defer c.skipLogMu.Unlock() @@ -853,9 +845,3 @@ func float64Ptr(v float64) *float64 { out := v return &out } - -func hashAdvisoryLockID(s string) int64 { - h := fnv.New64a() - _, _ = h.Write([]byte(s)) - return int64(h.Sum64()) -} diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index a3d847e0..90591a56 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -165,6 +165,9 @@ type OpsInsertSystemMetricsInput struct { DBOK *bool RedisOK *bool + RedisConnTotal *int + RedisConnIdle *int + DBConnActive *int DBConnIdle *int DBConnWaiting *int @@ -186,6 +189,13 @@ type OpsSystemMetricsSnapshot struct { DBOK *bool `json:"db_ok"` RedisOK *bool `json:"redis_ok"` + // Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max". 
+ DBMaxOpenConns *int `json:"db_max_open_conns"` + RedisPoolSize *int `json:"redis_pool_size"` + + RedisConnTotal *int `json:"redis_conn_total"` + RedisConnIdle *int `json:"redis_conn_idle"` + DBConnActive *int `json:"db_conn_active"` DBConnIdle *int `json:"db_conn_idle"` DBConnWaiting *int `json:"db_conn_waiting"` diff --git a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go index 1aea32be..09772616 100644 --- a/backend/internal/service/setting_service.go +++ b/backend/internal/service/setting_service.go @@ -139,6 +139,9 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled) updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled) updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault)) + if settings.OpsMetricsIntervalSeconds > 0 { + updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds) + } return s.settingRepo.SetMultiple(ctx, updates) } @@ -231,6 +234,7 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { SettingKeyOpsMonitoringEnabled: "true", SettingKeyOpsRealtimeMonitoringEnabled: "true", SettingKeyOpsQueryModeDefault: "auto", + SettingKeyOpsMetricsIntervalSeconds: "60", } return s.settingRepo.SetMultiple(ctx, defaults) @@ -301,6 +305,18 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled]) result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled]) result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault])) + result.OpsMetricsIntervalSeconds = 60 + if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" { + if v, err := strconv.Atoi(raw); err == nil { + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + result.OpsMetricsIntervalSeconds = v + } + } return result } diff --git a/backend/internal/service/settings_view.go b/backend/internal/service/settings_view.go index e9d07bca..1f3d925a 100644 --- a/backend/internal/service/settings_view.go +++ b/backend/internal/service/settings_view.go @@ -43,6 +43,7 @@ type SystemSettings struct { OpsMonitoringEnabled bool OpsRealtimeMonitoringEnabled bool OpsQueryModeDefault string + OpsMetricsIntervalSeconds int } type PublicSettings struct { diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 3c3529a9..851993ca 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -46,6 +46,8 @@ export interface OpsDashboardOverview { platform: string group_id?: number | null + health_score?: number + system_metrics?: OpsSystemMetricsSnapshot | null job_heartbeats?: OpsJobHeartbeat[] | null @@ -228,6 +230,9 @@ export interface OpsSystemMetricsSnapshot { db_ok?: boolean | null redis_ok?: boolean | null + redis_conn_total?: number | null + redis_conn_idle?: number | null + db_conn_active?: number | null db_conn_idle?: number | null db_conn_waiting?: number | null diff --git a/frontend/src/api/admin/settings.ts b/frontend/src/api/admin/settings.ts index 37b12e40..9ddeb5bf 100644 --- a/frontend/src/api/admin/settings.ts +++ b/frontend/src/api/admin/settings.ts @@ -50,6 +50,7 @@ export interface SystemSettings { ops_monitoring_enabled: boolean 
ops_realtime_monitoring_enabled: boolean ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds: number } export interface UpdateSettingsRequest { @@ -83,6 +84,7 @@ export interface UpdateSettingsRequest { ops_monitoring_enabled?: boolean ops_realtime_monitoring_enabled?: boolean ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds?: number } /** diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index f80a235f..1caae1d5 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -1733,8 +1733,10 @@ export default { redis: 'Redis', goroutines: 'Goroutines', jobs: 'Jobs', + jobsHelp: 'Click “Details” to view job heartbeats and recent errors', active: 'active', idle: 'idle', + waiting: 'waiting', ok: 'ok', lastRun: 'last_run:', lastSuccess: 'last_success:', @@ -1770,12 +1772,50 @@ export default { errorsSla: 'Errors (SLA scope)', upstreamExcl429529: 'Upstream (excl 429/529)', failedToLoadData: 'Failed to load ops data.', + failedToLoadOverview: 'Failed to load overview', + failedToLoadThroughputTrend: 'Failed to load throughput trend', + failedToLoadLatencyHistogram: 'Failed to load latency histogram', + failedToLoadErrorTrend: 'Failed to load error trend', + failedToLoadErrorDistribution: 'Failed to load error distribution', + failedToLoadErrorDetail: 'Failed to load error detail', + retryFailed: 'Retry failed', tpsK: 'TPS (K)', top: 'Top:', throughputTrend: 'Throughput Trend', latencyHistogram: 'Latency Histogram', errorTrend: 'Error Trend', errorDistribution: 'Error Distribution', + // Health Score & Diagnosis + health: 'Health', + healthCondition: 'Health Condition', + healthHelp: 'Overall system health score based on SLA, error rate, and resource usage', + healthyStatus: 'Healthy', + riskyStatus: 'At Risk', + idleStatus: 'Idle', + diagnosis: { + title: 'Smart Diagnosis', + footer: 'Automated diagnostic suggestions based on current metrics', + idle: 'System is currently idle', + idleImpact: 'No active traffic', + upstreamCritical: 'Upstream error rate critically high ({rate}%)', + upstreamCriticalImpact: 'May affect many user requests', + upstreamHigh: 'Upstream error rate elevated ({rate}%)', + upstreamHighImpact: 'Recommend checking upstream service status', + slaCritical: 'SLA critically below target ({sla}%)', + slaCriticalImpact: 'User experience severely degraded', + slaLow: 'SLA below target ({sla}%)', + slaLowImpact: 'Service quality needs attention', + errorHigh: 'Error rate too high ({rate}%)', + errorHighImpact: 'Many requests failing', + errorElevated: 'Error rate elevated ({rate}%)', + errorElevatedImpact: 'Recommend checking error logs', + healthCritical: 'Overall health score critically low ({score})', + healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation', + healthLow: 'Overall health score low ({score})', + healthLowImpact: 'May indicate minor instability; monitor SLA and error rates', + healthy: 'All system metrics normal', + healthyImpact: 'Service running stable' + }, // Error Log errorLog: { timeId: 'Time / ID', @@ -2069,7 +2109,21 @@ export default { throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', - errorDistribution: 'Error distribution by status code.' 
+ errorDistribution: 'Error distribution by status code.', + goroutines: + 'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.', + cpu: 'CPU usage percentage, showing system processor load.', + memory: 'Memory usage, including used and total available memory.', + db: 'Database connection pool status, including active, idle, and waiting connections.', + redis: 'Redis connection pool status, showing active and idle connections.', + jobs: 'Background job execution status, including last run time, success time, and error information.', + qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.', + tokens: 'Total number of tokens processed in the current time window.', + sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).', + errors: 'Error statistics, including total errors, error rate, and upstream error rate.', + latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.', + ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.', + health: 'System health score (0-100), considering SLA, error rate, and resource usage.' }, charts: { emptyRequest: 'No requests in this window.', @@ -2183,7 +2237,9 @@ export default { queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)', queryModeAuto: 'Auto (recommended)', queryModeRaw: 'Raw (most accurate, slower)', - queryModePreagg: 'Preagg (fastest, requires aggregation)' + queryModePreagg: 'Preagg (fastest, requires aggregation)', + metricsInterval: 'Metrics Collection Interval (seconds)', + metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)' }, adminApiKey: { title: 'Admin API Key', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index 646511f4..d8ce293c 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -1878,8 +1878,10 @@ export default { redis: 'Redis', goroutines: '协程', jobs: '后台任务', + jobsHelp: '点击“明细”查看任务心跳与报错信息', active: '活跃', idle: '空闲', + waiting: '等待', ok: '正常', lastRun: '最近运行', lastSuccess: '最近成功', @@ -1898,8 +1900,8 @@ export default { errors: '错误', errorRate: '错误率:', upstreamRate: '上游错误率:', - latencyDuration: '延迟 (duration_ms)', - ttftLabel: 'TTFT (first_token_ms)', + latencyDuration: '延迟(毫秒)', + ttftLabel: '首字延迟(毫秒)', p50: 'p50', p90: 'p90', p95: 'p95', @@ -1915,12 +1917,50 @@ export default { errorsSla: '错误(SLA范围)', upstreamExcl429529: '上游(排除429/529)', failedToLoadData: '加载运维数据失败', - tpsK: 'TPS (K)', + failedToLoadOverview: '加载概览数据失败', + failedToLoadThroughputTrend: '加载吞吐趋势失败', + failedToLoadLatencyHistogram: '加载延迟分布失败', + failedToLoadErrorTrend: '加载错误趋势失败', + failedToLoadErrorDistribution: '加载错误分布失败', + failedToLoadErrorDetail: '加载错误详情失败', + retryFailed: '重试失败', + tpsK: 'TPS(千)', top: '最高:', throughputTrend: '吞吐趋势', latencyHistogram: '延迟分布', errorTrend: '错误趋势', errorDistribution: '错误分布', + // Health Score & Diagnosis + health: '健康', + healthCondition: '健康状况', + healthHelp: '基于 SLA、错误率和资源使用情况的系统整体健康评分', + healthyStatus: '健康', + riskyStatus: '风险', + idleStatus: '待机', + diagnosis: { + title: '智能诊断', + footer: '基于当前指标的自动诊断建议', + idle: '系统当前处于待机状态', + idleImpact: '无活跃流量', + upstreamCritical: '上游错误率严重偏高 ({rate}%)', + upstreamCriticalImpact: '可能影响大量用户请求', + upstreamHigh: '上游错误率偏高 ({rate}%)', + upstreamHighImpact: 
'建议检查上游服务状态', + slaCritical: 'SLA 严重低于目标 ({sla}%)', + slaCriticalImpact: '用户体验严重受损', + slaLow: 'SLA 低于目标 ({sla}%)', + slaLowImpact: '需要关注服务质量', + errorHigh: '错误率过高 ({rate}%)', + errorHighImpact: '大量请求失败', + errorElevated: '错误率偏高 ({rate}%)', + errorElevatedImpact: '建议检查错误日志', + healthCritical: '综合健康评分过低 ({score})', + healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟', + healthLow: '综合健康评分偏低 ({score})', + healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率', + healthy: '所有系统指标正常', + healthyImpact: '服务运行稳定' + }, // Error Log errorLog: { timeId: '时间 / ID', @@ -2212,9 +2252,23 @@ export default { }, tooltips: { throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', - latencyHistogram: '成功请求的延迟分布(duration_ms)。', + latencyHistogram: '成功请求的延迟分布(毫秒)。', errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。', - errorDistribution: '按状态码统计的错误分布。' + errorDistribution: '按状态码统计的错误分布。', + goroutines: + 'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。', + cpu: 'CPU 使用率,显示系统处理器的负载情况。', + memory: '内存使用率,包括已使用和总可用内存。', + db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。', + redis: 'Redis 连接池状态,显示活跃和空闲的连接数。', + jobs: '后台任务执行状态,包括最近运行时间、成功时间和错误信息。', + qps: '每秒查询数(QPS)和每秒Token数(TPS),实时显示系统吞吐量。', + tokens: '当前时间窗口内处理的总Token数量。', + sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。', + errors: '错误统计,包括总错误数、错误率和上游错误率。', + latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。', + ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。', + health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。' }, charts: { emptyRequest: '该时间窗口内暂无请求。', @@ -2320,14 +2374,16 @@ export default { description: '启用运维监控模块,用于排障与健康可视化', disabled: '运维监控已关闭', enabled: '启用运维监控', - enabledHint: '启用 Ops 运维监控模块(仅管理员可见)', + enabledHint: '启用运维监控模块(仅管理员可见)', realtimeEnabled: '启用实时监控', - realtimeEnabledHint: '启用实时 QPS/指标推送(WebSocket)', + realtimeEnabledHint: '启用实时请求速率和指标推送(WebSocket)', queryMode: '默认查询模式', - queryModeHint: 'Ops Dashboard 默认查询模式(auto/raw/preagg)', + queryModeHint: '运维监控默认查询模式(自动/原始/预聚合)', queryModeAuto: '自动(推荐)', - queryModeRaw: 'Raw(最准,但较慢)', - queryModePreagg: 'Preagg(最快,需预聚合)' + queryModeRaw: '原始(最准确,但较慢)', + queryModePreagg: '预聚合(最快,需预聚合)', + metricsInterval: '采集频率(秒)', + metricsIntervalHint: '系统/请求指标采集频率(60-3600 秒)' }, adminApiKey: { title: '管理员 API Key', diff --git a/frontend/src/views/admin/SettingsView.vue b/frontend/src/views/admin/SettingsView.vue index 4375a6cc..cf7a2867 100644 --- a/frontend/src/views/admin/SettingsView.vue +++ b/frontend/src/views/admin/SettingsView.vue @@ -715,6 +715,25 @@ class="w-[220px]" /> + +
+ {{ t('admin.settings.opsMonitoring.metricsIntervalHint') }} +
+