feat(运维监控): 增强监控功能和健康评分系统

后端改进:
- 新增健康评分计算服务(ops_health_score.go)
- 添加分布式锁支持(ops_advisory_lock.go)
- 优化指标采集和聚合逻辑
- 新增运维指标采集间隔配置(60-3600秒)
- 移除未使用的WebSocket查询token认证中间件
- 改进清理服务和告警评估逻辑

前端改进:
- 简化OpsDashboard组件结构
- 完善国际化文本(中英文)
- 新增运维监控相关API类型定义
- 添加运维指标采集间隔设置界面
- 优化错误详情模态框

测试:
- 添加健康评分单元测试
- 更新API契约测试
This commit is contained in:
IanShaw027
2026-01-10 01:38:47 +08:00
parent 8ae75e7f6e
commit 585257d340
25 changed files with 570 additions and 385 deletions

View File

@@ -68,6 +68,7 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
OpsMonitoringEnabled: settings.OpsMonitoringEnabled, OpsMonitoringEnabled: settings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled, OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: settings.OpsQueryModeDefault, OpsQueryModeDefault: settings.OpsQueryModeDefault,
OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds,
}) })
} }
@@ -115,9 +116,10 @@ type UpdateSettingsRequest struct {
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext) // Ops monitoring (vNext)
OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"` OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"` OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault *string `json:"ops_query_mode_default"` OpsQueryModeDefault *string `json:"ops_query_mode_default"`
OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"`
} }
// UpdateSettings 更新系统设置 // UpdateSettings 更新系统设置
@@ -173,6 +175,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
} }
} }
// Ops metrics collector interval validation (seconds).
if req.OpsMetricsIntervalSeconds != nil {
v := *req.OpsMetricsIntervalSeconds
if v < 60 {
v = 60
}
if v > 3600 {
v = 3600
}
req.OpsMetricsIntervalSeconds = &v
}
settings := &service.SystemSettings{ settings := &service.SystemSettings{
RegistrationEnabled: req.RegistrationEnabled, RegistrationEnabled: req.RegistrationEnabled,
EmailVerifyEnabled: req.EmailVerifyEnabled, EmailVerifyEnabled: req.EmailVerifyEnabled,
@@ -219,6 +233,12 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
} }
return previousSettings.OpsQueryModeDefault return previousSettings.OpsQueryModeDefault
}(), }(),
OpsMetricsIntervalSeconds: func() int {
if req.OpsMetricsIntervalSeconds != nil {
return *req.OpsMetricsIntervalSeconds
}
return previousSettings.OpsMetricsIntervalSeconds
}(),
} }
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil { if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
@@ -266,6 +286,7 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled, OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled, OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault, OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault,
OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds,
}) })
} }
@@ -375,6 +396,9 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings,
if before.OpsQueryModeDefault != after.OpsQueryModeDefault { if before.OpsQueryModeDefault != after.OpsQueryModeDefault {
changed = append(changed, "ops_query_mode_default") changed = append(changed, "ops_query_mode_default")
} }
if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds {
changed = append(changed, "ops_metrics_interval_seconds")
}
return changed return changed
} }

View File

@@ -39,9 +39,10 @@ type SystemSettings struct {
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext) // Ops monitoring (vNext)
OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault string `json:"ops_query_mode_default"` OpsQueryModeDefault string `json:"ops_query_mode_default"`
OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"`
} }
type PublicSettings struct { type PublicSettings struct {

View File

@@ -68,6 +68,9 @@ INSERT INTO ops_system_metrics (
db_ok, db_ok,
redis_ok, redis_ok,
redis_conn_total,
redis_conn_idle,
db_conn_active, db_conn_active,
db_conn_idle, db_conn_idle,
db_conn_waiting, db_conn_waiting,
@@ -83,8 +86,9 @@ INSERT INTO ops_system_metrics (
$21,$22,$23,$24,$25,$26, $21,$22,$23,$24,$25,$26,
$27,$28,$29,$30, $27,$28,$29,$30,
$31,$32, $31,$32,
$33,$34,$35, $33,$34,
$36,$37 $35,$36,$37,
$38,$39
)` )`
_, err := r.db.ExecContext( _, err := r.db.ExecContext(
@@ -130,6 +134,9 @@ INSERT INTO ops_system_metrics (
opsNullBool(input.DBOK), opsNullBool(input.DBOK),
opsNullBool(input.RedisOK), opsNullBool(input.RedisOK),
opsNullInt(input.RedisConnTotal),
opsNullInt(input.RedisConnIdle),
opsNullInt(input.DBConnActive), opsNullInt(input.DBConnActive),
opsNullInt(input.DBConnIdle), opsNullInt(input.DBConnIdle),
opsNullInt(input.DBConnWaiting), opsNullInt(input.DBConnWaiting),
@@ -162,6 +169,9 @@ SELECT
db_ok, db_ok,
redis_ok, redis_ok,
redis_conn_total,
redis_conn_idle,
db_conn_active, db_conn_active,
db_conn_idle, db_conn_idle,
db_conn_waiting, db_conn_waiting,
@@ -182,6 +192,8 @@ LIMIT 1`
var memPct sql.NullFloat64 var memPct sql.NullFloat64
var dbOK sql.NullBool var dbOK sql.NullBool
var redisOK sql.NullBool var redisOK sql.NullBool
var redisTotal sql.NullInt64
var redisIdle sql.NullInt64
var dbActive sql.NullInt64 var dbActive sql.NullInt64
var dbIdle sql.NullInt64 var dbIdle sql.NullInt64
var dbWaiting sql.NullInt64 var dbWaiting sql.NullInt64
@@ -198,6 +210,8 @@ LIMIT 1`
&memPct, &memPct,
&dbOK, &dbOK,
&redisOK, &redisOK,
&redisTotal,
&redisIdle,
&dbActive, &dbActive,
&dbIdle, &dbIdle,
&dbWaiting, &dbWaiting,
@@ -231,6 +245,14 @@ LIMIT 1`
v := redisOK.Bool v := redisOK.Bool
out.RedisOK = &v out.RedisOK = &v
} }
if redisTotal.Valid {
v := int(redisTotal.Int64)
out.RedisConnTotal = &v
}
if redisIdle.Valid {
v := int(redisIdle.Int64)
out.RedisConnIdle = &v
}
if dbActive.Valid { if dbActive.Valid {
v := int(dbActive.Int64) v := int(dbActive.Int64)
out.DBConnActive = &v out.DBConnActive = &v
@@ -398,4 +420,3 @@ func opsNullTime(v *time.Time) any {
} }
return sql.NullTime{Time: *v, Valid: true} return sql.NullTime{Time: *v, Valid: true}
} }

View File

@@ -319,7 +319,9 @@ func TestAPIContracts(t *testing.T) {
"enable_identity_patch": true, "enable_identity_patch": true,
"identity_patch_prompt": "", "identity_patch_prompt": "",
"ops_monitoring_enabled": true, "ops_monitoring_enabled": true,
"ops_realtime_monitoring_enabled": true "ops_realtime_monitoring_enabled": true,
"ops_query_mode_default": "auto",
"ops_metrics_interval_seconds": 60
} }
}`, }`,
}, },

View File

@@ -1,54 +0,0 @@
package middleware
import (
"net/http"
"strings"
"github.com/gin-gonic/gin"
)
// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header
// for WebSocket handshake requests on a small allow-list of endpoints.
//
// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes
// are protected by header-based auth. This keeps the token support scoped to WS only.
func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc {
return func(c *gin.Context) {
if c == nil || c.Request == nil {
if c != nil {
c.Next()
}
return
}
// Only GET websocket upgrades.
if c.Request.Method != http.MethodGet {
c.Next()
return
}
if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") {
c.Next()
return
}
// If caller already supplied auth headers, don't override.
if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" {
c.Next()
return
}
// Allow-list ops websocket endpoints.
path := strings.TrimSpace(c.Request.URL.Path)
if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") {
c.Next()
return
}
token := strings.TrimSpace(c.Query("token"))
if token != "" {
c.Request.Header.Set("Authorization", "Bearer "+token)
}
c.Next()
}
}

View File

@@ -25,8 +25,6 @@ func SetupRouter(
) *gin.Engine { ) *gin.Engine {
// 应用中间件 // 应用中间件
r.Use(middleware2.Logger()) r.Use(middleware2.Logger())
// WebSocket handshake auth helper (token via query param, WS endpoints only).
r.Use(middleware2.InjectBearerTokenFromQueryForWebSocket())
r.Use(middleware2.CORS(cfg.CORS)) r.Use(middleware2.CORS(cfg.CORS))
r.Use(middleware2.SecurityHeaders(cfg.Security.CSP)) r.Use(middleware2.SecurityHeaders(cfg.Security.CSP))

View File

@@ -0,0 +1,46 @@
package service
import (
"context"
"database/sql"
"hash/fnv"
"time"
)
// hashAdvisoryLockID maps an arbitrary lock key to a 64-bit Postgres advisory
// lock identifier using the FNV-1a hash. The signed reinterpretation is fine:
// advisory locks only need a stable bigint, not an ordered one.
func hashAdvisoryLockID(key string) int64 {
	digest := fnv.New64a()
	// Write on an fnv hash never returns an error.
	_, _ = digest.Write([]byte(key))
	return int64(digest.Sum64())
}
// tryAcquireDBAdvisoryLock attempts to grab a Postgres session-level advisory lock
// identified by lockID. On success it returns (release, true); the release function
// unlocks and returns the pinned connection to the pool and must be called exactly
// once. Any failure (nil db, connection error, lock held elsewhere) yields (nil, false).
func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) {
	if db == nil {
		return nil, false
	}
	if ctx == nil {
		ctx = context.Background()
	}
	// Advisory locks are session-scoped, so pin a dedicated connection for the
	// lifetime of the lock.
	conn, err := db.Conn(ctx)
	if err != nil {
		return nil, false
	}
	var got bool
	err = conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&got)
	if err != nil || !got {
		_ = conn.Close()
		return nil, false
	}
	return func() {
		// Best-effort unlock with a short deadline; closing the connection would
		// drop the session lock anyway.
		unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID)
		_ = conn.Close()
	}, true
}

View File

@@ -376,28 +376,37 @@ return 0
`) `)
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
if s == nil || s.redisClient == nil { if s == nil {
return nil, true return nil, false
} }
if ctx == nil { if ctx == nil {
ctx = context.Background() ctx = context.Background()
} }
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() // Prefer Redis leader lock when available (multi-instance), but avoid stampeding
if err != nil { // the DB when Redis is flaky by falling back to a DB advisory lock.
// Fail-open: do not block single-instance deployments. if s.redisClient != nil {
return nil, true ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err == nil {
if !ok {
s.maybeLogSkip(logPrefix)
return nil, false
}
release := func() {
ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
}
return release, true
}
// Redis error: fall through to DB advisory lock.
} }
release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
if !ok { if !ok {
s.maybeLogSkip(logPrefix) s.maybeLogSkip(logPrefix)
return nil, false return nil, false
} }
release := func() {
ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
}
return release, true return release, true
} }

View File

@@ -720,11 +720,12 @@ func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, loc
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil { if err != nil {
// Fail-open for single-node environments, but warn. // Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky.
// Single-node deployments can disable the distributed lock via runtime settings.
s.warnNoRedisOnce.Do(func() { s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err) log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err)
}) })
return nil, true return nil, false
} }
if !ok { if !ok {
s.maybeLogSkip(key) s.maybeLogSkip(key)

View File

@@ -300,30 +300,36 @@ func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), b
return nil, true return nil, true
} }
if s.redisClient == nil {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] redis not configured; running without distributed lock")
})
return nil, true
}
key := opsCleanupLeaderLockKeyDefault key := opsCleanupLeaderLockKeyDefault
ttl := opsCleanupLeaderLockTTLDefault ttl := opsCleanupLeaderLockTTLDefault
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() // Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by
if err != nil { // falling back to a DB advisory lock.
if s.redisClient != nil {
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err == nil {
if !ok {
return nil, false
}
return func() {
_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
}
// Redis error: fall back to DB advisory lock.
s.warnNoRedisOnce.Do(func() { s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err) log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err)
})
} else {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] redis not configured; using DB advisory lock")
}) })
return nil, true
} }
release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
if !ok { if !ok {
return nil, false return nil, false
} }
return release, true
return func() {
_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
} }
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {

View File

@@ -5,6 +5,7 @@ import (
"database/sql" "database/sql"
"errors" "errors"
"log" "log"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
) )
@@ -39,6 +40,16 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
// Best-effort system health + jobs; dashboard metrics should still render if these are missing. // Best-effort system health + jobs; dashboard metrics should still render if these are missing.
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
// Attach config-derived limits so the UI can show "current / max" for connection pools.
// These are best-effort and should never block the dashboard rendering.
if s != nil && s.cfg != nil {
if s.cfg.Database.MaxOpenConns > 0 {
metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
}
if s.cfg.Redis.PoolSize > 0 {
metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
}
}
overview.SystemMetrics = metrics overview.SystemMetrics = metrics
} else if err != nil && !errors.Is(err, sql.ErrNoRows) { } else if err != nil && !errors.Is(err, sql.ErrNoRows) {
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
@@ -50,6 +61,8 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
log.Printf("[Ops] ListJobHeartbeats failed: %v", err) log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
} }
overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
return overview, nil return overview, nil
} }

View File

@@ -35,6 +35,10 @@ type OpsDashboardOverview struct {
Platform string `json:"platform"` Platform string `json:"platform"`
GroupID *int64 `json:"group_id"` GroupID *int64 `json:"group_id"`
// HealthScore is a backend-computed overall health score (0-100).
// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
HealthScore int `json:"health_score"`
// Latest system-level snapshot (window=1m, global). // Latest system-level snapshot (window=1m, global).
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`

View File

@@ -0,0 +1,126 @@
package service
import (
"math"
"time"
)
// computeDashboardHealthScore derives a 0-100 health score from the dashboard
// overview metrics.
//
// Scoring philosophy:
//   - The backend owns the score; the UI only renders it.
//   - Business signals (SLA, error rates, latency tails) carry most of the
//     weight, with infra signals (db/redis/cpu/mem/jobs) layered on top.
//   - Stay conservative: punish clear degradations, stay calm on idle or
//     missing data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
	if overview == nil {
		return 0
	}

	// No traffic at all: report a perfect score rather than a scary one.
	// The UI can still render a gray/idle state from QPS + error rate.
	noTraffic := overview.RequestCountSLA <= 0 &&
		overview.RequestCountTotal <= 0 &&
		overview.ErrorCountTotal <= 0
	if noTraffic {
		return 100
	}

	score := 100.0

	// SLA is the primary signal. The 99.5% target is deliberately modest for
	// LLM gateways and can be tuned later. Costs up to -45 points.
	if slaPct := clampFloat64(overview.SLA*100, 0, 100); slaPct < 99.5 {
		score -= math.Min(45, (99.5-slaPct)*12)
	}

	// Overall error rate: capped at -20 points (cap reached at a 6% rate).
	if errPct := clampFloat64(overview.ErrorRate*100, 0, 100); errPct > 1 {
		score -= math.Min(20, (errPct-1)*4)
	}

	// Upstream instability gets its own weight, but less than SLA/errors.
	if upPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100); upPct > 1 {
		score -= math.Min(15, (upPct-1)*3)
	}

	// Tail latency (p99 of duration + TTFT): penalize only when clearly elevated.
	if d99 := overview.Duration.P99; d99 != nil {
		if ms := float64(*d99); ms > 2000 {
			score -= math.Min(20, (ms-2000)/900) // ~20s => ~-20
		}
	}
	if t99 := overview.TTFT.P99; t99 != nil {
		if ms := float64(*t99); ms > 500 {
			score -= math.Min(10, (ms-500)/200) // 2.5s => -10
		}
	}

	// Best-effort infra snapshot.
	if sm := overview.SystemMetrics; sm != nil {
		if sm.DBOK != nil && !*sm.DBOK {
			score -= 20
		}
		if sm.RedisOK != nil && !*sm.RedisOK {
			score -= 15
		}
		if sm.CPUUsagePercent != nil {
			if cpu := clampFloat64(*sm.CPUUsagePercent, 0, 100); cpu > 85 {
				score -= math.Min(10, (cpu-85)*1.5)
			}
		}
		if sm.MemoryUsagePercent != nil {
			if mem := clampFloat64(*sm.MemoryUsagePercent, 0, 100); mem > 90 {
				score -= math.Min(10, (mem-90)*1.0)
			}
		}
		if sm.DBConnWaiting != nil && *sm.DBConnWaiting > 0 {
			score -= math.Min(10, float64(*sm.DBConnWaiting)*2)
		}
		if sm.ConcurrencyQueueDepth != nil && *sm.ConcurrencyQueueDepth > 0 {
			score -= math.Min(10, float64(*sm.ConcurrencyQueueDepth)*0.5)
		}
	}

	// Job heartbeats (best-effort): only clear "error after last success"
	// signals count fully; stale successes count a little. Combined impact
	// is capped at -15.
	var jobPenalty float64
	for _, hb := range overview.JobHeartbeats {
		switch {
		case hb == nil:
			// Skip missing heartbeat entries.
		case hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)):
			jobPenalty += 5
		case hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute:
			jobPenalty += 2
		}
	}
	score -= math.Min(15, jobPenalty)

	return int(math.Round(clampFloat64(score, 0, 100)))
}
// clampFloat64 restricts v to the inclusive interval [lo, hi].
// NaN inputs fail both comparisons and are returned unchanged.
func clampFloat64(v float64, lo float64, hi float64) float64 {
	switch {
	case v < lo:
		return lo
	case v > hi:
		return hi
	default:
		return v
	}
}

View File

@@ -0,0 +1,60 @@
//go:build unit
package service
import (
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
t.Parallel()
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
require.Equal(t, 100, score)
}
func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
t.Parallel()
ov := &OpsDashboardOverview{
RequestCountTotal: 100,
RequestCountSLA: 100,
SuccessCount: 90,
ErrorCountTotal: 10,
ErrorCountSLA: 10,
SLA: 0.90,
ErrorRate: 0.10,
UpstreamErrorRate: 0.08,
Duration: OpsPercentiles{P99: intPtr(20_000)},
TTFT: OpsPercentiles{P99: intPtr(2_000)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(98.0),
MemoryUsagePercent: float64Ptr(97.0),
DBConnWaiting: intPtr(3),
ConcurrencyQueueDepth: intPtr(10),
},
JobHeartbeats: []*OpsJobHeartbeat{
{
JobName: "job-a",
LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
LastError: stringPtr("boom"),
},
},
}
score := computeDashboardHealthScore(time.Now().UTC(), ov)
require.Less(t, score, 80)
require.GreaterOrEqual(t, score, 0)
}
// timePtr returns a pointer to a copy of v; a test-fixture helper for optional fields.
func timePtr(v time.Time) *time.Time {
	out := v
	return &out
}

// stringPtr returns a pointer to a copy of v; a test-fixture helper for optional fields.
func stringPtr(v string) *string {
	out := v
	return &out
}

View File

@@ -5,7 +5,6 @@ import (
"database/sql" "database/sql"
"errors" "errors"
"fmt" "fmt"
"hash/fnv"
"log" "log"
"math" "math"
"os" "os"
@@ -262,6 +261,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
dbOK := c.checkDB(ctx) dbOK := c.checkDB(ctx)
redisOK := c.checkRedis(ctx) redisOK := c.checkRedis(ctx)
active, idle := c.dbPoolStats() active, idle := c.dbPoolStats()
redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()
successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
if err != nil { if err != nil {
@@ -327,6 +327,19 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
DBOK: boolPtr(dbOK), DBOK: boolPtr(dbOK),
RedisOK: boolPtr(redisOK), RedisOK: boolPtr(redisOK),
RedisConnTotal: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisTotal)
}(),
RedisConnIdle: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisIdle)
}(),
DBConnActive: intPtr(active), DBConnActive: intPtr(active),
DBConnIdle: intPtr(idle), DBConnIdle: intPtr(idle),
GoroutineCount: intPtr(goroutines), GoroutineCount: intPtr(goroutines),
@@ -722,6 +735,17 @@ func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
return c.redisClient.Ping(ctx).Err() == nil return c.redisClient.Ping(ctx).Err() == nil
} }
func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
if c == nil || c.redisClient == nil {
return 0, 0, false
}
stats := c.redisClient.PoolStats()
if stats == nil {
return 0, 0, false
}
return int(stats.TotalConns), int(stats.IdleConns), true
}
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
if c == nil || c.db == nil { if c == nil || c.db == nil {
return 0, 0 return 0, 0
@@ -749,7 +773,7 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(),
if err != nil { if err != nil {
// Prefer fail-closed to avoid stampeding the database when Redis is flaky. // Prefer fail-closed to avoid stampeding the database when Redis is flaky.
// Fallback to a DB advisory lock when Redis is present but unavailable. // Fallback to a DB advisory lock when Redis is present but unavailable.
release, ok := c.tryAcquireDBAdvisoryLock(ctx) release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
if !ok { if !ok {
c.maybeLogSkip() c.maybeLogSkip()
return nil, false return nil, false
@@ -769,38 +793,6 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(),
return release, true return release, true
} }
func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) {
if c == nil || c.db == nil {
return nil, false
}
if ctx == nil {
ctx = context.Background()
}
conn, err := c.db.Conn(ctx)
if err != nil {
return nil, false
}
acquired := false
if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil {
_ = conn.Close()
return nil, false
}
if !acquired {
_ = conn.Close()
return nil, false
}
release := func() {
unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID)
_ = conn.Close()
}
return release, true
}
func (c *OpsMetricsCollector) maybeLogSkip() { func (c *OpsMetricsCollector) maybeLogSkip() {
c.skipLogMu.Lock() c.skipLogMu.Lock()
defer c.skipLogMu.Unlock() defer c.skipLogMu.Unlock()
@@ -853,9 +845,3 @@ func float64Ptr(v float64) *float64 {
out := v out := v
return &out return &out
} }
func hashAdvisoryLockID(s string) int64 {
h := fnv.New64a()
_, _ = h.Write([]byte(s))
return int64(h.Sum64())
}

View File

@@ -165,6 +165,9 @@ type OpsInsertSystemMetricsInput struct {
DBOK *bool DBOK *bool
RedisOK *bool RedisOK *bool
RedisConnTotal *int
RedisConnIdle *int
DBConnActive *int DBConnActive *int
DBConnIdle *int DBConnIdle *int
DBConnWaiting *int DBConnWaiting *int
@@ -186,6 +189,13 @@ type OpsSystemMetricsSnapshot struct {
DBOK *bool `json:"db_ok"` DBOK *bool `json:"db_ok"`
RedisOK *bool `json:"redis_ok"` RedisOK *bool `json:"redis_ok"`
// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
DBMaxOpenConns *int `json:"db_max_open_conns"`
RedisPoolSize *int `json:"redis_pool_size"`
RedisConnTotal *int `json:"redis_conn_total"`
RedisConnIdle *int `json:"redis_conn_idle"`
DBConnActive *int `json:"db_conn_active"` DBConnActive *int `json:"db_conn_active"`
DBConnIdle *int `json:"db_conn_idle"` DBConnIdle *int `json:"db_conn_idle"`
DBConnWaiting *int `json:"db_conn_waiting"` DBConnWaiting *int `json:"db_conn_waiting"`

View File

@@ -139,6 +139,9 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled) updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled) updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault)) updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
if settings.OpsMetricsIntervalSeconds > 0 {
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
}
return s.settingRepo.SetMultiple(ctx, updates) return s.settingRepo.SetMultiple(ctx, updates)
} }
@@ -231,6 +234,7 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
SettingKeyOpsMonitoringEnabled: "true", SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true", SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto", SettingKeyOpsQueryModeDefault: "auto",
SettingKeyOpsMetricsIntervalSeconds: "60",
} }
return s.settingRepo.SetMultiple(ctx, defaults) return s.settingRepo.SetMultiple(ctx, defaults)
@@ -301,6 +305,18 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled]) result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled]) result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault])) result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
result.OpsMetricsIntervalSeconds = 60
if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
if v, err := strconv.Atoi(raw); err == nil {
if v < 60 {
v = 60
}
if v > 3600 {
v = 3600
}
result.OpsMetricsIntervalSeconds = v
}
}
return result return result
} }

View File

@@ -43,6 +43,7 @@ type SystemSettings struct {
OpsMonitoringEnabled bool OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string OpsQueryModeDefault string
OpsMetricsIntervalSeconds int
} }
type PublicSettings struct { type PublicSettings struct {

View File

@@ -46,6 +46,8 @@ export interface OpsDashboardOverview {
platform: string platform: string
group_id?: number | null group_id?: number | null
health_score?: number
system_metrics?: OpsSystemMetricsSnapshot | null system_metrics?: OpsSystemMetricsSnapshot | null
job_heartbeats?: OpsJobHeartbeat[] | null job_heartbeats?: OpsJobHeartbeat[] | null
@@ -228,6 +230,9 @@ export interface OpsSystemMetricsSnapshot {
db_ok?: boolean | null db_ok?: boolean | null
redis_ok?: boolean | null redis_ok?: boolean | null
redis_conn_total?: number | null
redis_conn_idle?: number | null
db_conn_active?: number | null db_conn_active?: number | null
db_conn_idle?: number | null db_conn_idle?: number | null
db_conn_waiting?: number | null db_conn_waiting?: number | null

View File

@@ -50,6 +50,7 @@ export interface SystemSettings {
ops_monitoring_enabled: boolean ops_monitoring_enabled: boolean
ops_realtime_monitoring_enabled: boolean ops_realtime_monitoring_enabled: boolean
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds: number
} }
export interface UpdateSettingsRequest { export interface UpdateSettingsRequest {
@@ -83,6 +84,7 @@ export interface UpdateSettingsRequest {
ops_monitoring_enabled?: boolean ops_monitoring_enabled?: boolean
ops_realtime_monitoring_enabled?: boolean ops_realtime_monitoring_enabled?: boolean
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds?: number
} }
/** /**

View File

@@ -1733,8 +1733,10 @@ export default {
redis: 'Redis', redis: 'Redis',
goroutines: 'Goroutines', goroutines: 'Goroutines',
jobs: 'Jobs', jobs: 'Jobs',
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
active: 'active', active: 'active',
idle: 'idle', idle: 'idle',
waiting: 'waiting',
ok: 'ok', ok: 'ok',
lastRun: 'last_run:', lastRun: 'last_run:',
lastSuccess: 'last_success:', lastSuccess: 'last_success:',
@@ -1770,12 +1772,50 @@ export default {
errorsSla: 'Errors (SLA scope)', errorsSla: 'Errors (SLA scope)',
upstreamExcl429529: 'Upstream (excl 429/529)', upstreamExcl429529: 'Upstream (excl 429/529)',
failedToLoadData: 'Failed to load ops data.', failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
retryFailed: 'Retry failed',
tpsK: 'TPS (K)', tpsK: 'TPS (K)',
top: 'Top:', top: 'Top:',
throughputTrend: 'Throughput Trend', throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram', latencyHistogram: 'Latency Histogram',
errorTrend: 'Error Trend', errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution', errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
health: 'Health',
healthCondition: 'Health Condition',
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',
idle: 'System is currently idle',
idleImpact: 'No active traffic',
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
upstreamCriticalImpact: 'May affect many user requests',
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
upstreamHighImpact: 'Recommend checking upstream service status',
slaCritical: 'SLA critically below target ({sla}%)',
slaCriticalImpact: 'User experience severely degraded',
slaLow: 'SLA below target ({sla}%)',
slaLowImpact: 'Service quality needs attention',
errorHigh: 'Error rate too high ({rate}%)',
errorHighImpact: 'Many requests failing',
errorElevated: 'Error rate elevated ({rate}%)',
errorElevatedImpact: 'Recommend checking error logs',
healthCritical: 'Overall health score critically low ({score})',
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
healthLow: 'Overall health score low ({score})',
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
healthy: 'All system metrics normal',
healthyImpact: 'Service running stable'
},
// Error Log // Error Log
errorLog: { errorLog: {
timeId: 'Time / ID', timeId: 'Time / ID',
@@ -2069,7 +2109,21 @@ export default {
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.' errorDistribution: 'Error distribution by status code.',
goroutines:
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
cpu: 'CPU usage percentage, showing system processor load.',
memory: 'Memory usage, including used and total available memory.',
db: 'Database connection pool status, including active, idle, and waiting connections.',
redis: 'Redis connection pool status, showing active and idle connections.',
jobs: 'Background job execution status, including last run time, success time, and error information.',
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
tokens: 'Total number of tokens processed in the current time window.',
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
}, },
charts: { charts: {
emptyRequest: 'No requests in this window.', emptyRequest: 'No requests in this window.',
@@ -2183,7 +2237,9 @@ export default {
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)', queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
queryModeAuto: 'Auto (recommended)', queryModeAuto: 'Auto (recommended)',
queryModeRaw: 'Raw (most accurate, slower)', queryModeRaw: 'Raw (most accurate, slower)',
queryModePreagg: 'Preagg (fastest, requires aggregation)' queryModePreagg: 'Preagg (fastest, requires aggregation)',
metricsInterval: 'Metrics Collection Interval (seconds)',
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
}, },
adminApiKey: { adminApiKey: {
title: 'Admin API Key', title: 'Admin API Key',

View File

@@ -1878,8 +1878,10 @@ export default {
redis: 'Redis', redis: 'Redis',
goroutines: '协程', goroutines: '协程',
jobs: '后台任务', jobs: '后台任务',
jobsHelp: '点击“明细”查看任务心跳与报错信息',
active: '活跃', active: '活跃',
idle: '空闲', idle: '空闲',
waiting: '等待',
ok: '正常', ok: '正常',
lastRun: '最近运行', lastRun: '最近运行',
lastSuccess: '最近成功', lastSuccess: '最近成功',
@@ -1898,8 +1900,8 @@ export default {
errors: '错误', errors: '错误',
errorRate: '错误率:', errorRate: '错误率:',
upstreamRate: '上游错误率:', upstreamRate: '上游错误率:',
latencyDuration: '延迟 (duration_ms)', latencyDuration: '延迟(毫秒)',
ttftLabel: 'TTFT (first_token_ms)', ttftLabel: '首字延迟(毫秒)',
p50: 'p50', p50: 'p50',
p90: 'p90', p90: 'p90',
p95: 'p95', p95: 'p95',
@@ -1915,12 +1917,50 @@ export default {
errorsSla: '错误SLA范围', errorsSla: '错误SLA范围',
upstreamExcl429529: '上游排除429/529', upstreamExcl429529: '上游排除429/529',
failedToLoadData: '加载运维数据失败', failedToLoadData: '加载运维数据失败',
tpsK: 'TPS (K)', failedToLoadOverview: '加载概览数据失败',
failedToLoadThroughputTrend: '加载吞吐趋势失败',
failedToLoadLatencyHistogram: '加载延迟分布失败',
failedToLoadErrorTrend: '加载错误趋势失败',
failedToLoadErrorDistribution: '加载错误分布失败',
failedToLoadErrorDetail: '加载错误详情失败',
retryFailed: '重试失败',
tpsK: 'TPS',
top: '最高:', top: '最高:',
throughputTrend: '吞吐趋势', throughputTrend: '吞吐趋势',
latencyHistogram: '延迟分布', latencyHistogram: '延迟分布',
errorTrend: '错误趋势', errorTrend: '错误趋势',
errorDistribution: '错误分布', errorDistribution: '错误分布',
// Health Score & Diagnosis
health: '健康',
healthCondition: '健康状况',
healthHelp: '基于 SLA、错误率和资源使用情况的系统整体健康评分',
healthyStatus: '健康',
riskyStatus: '风险',
idleStatus: '待机',
diagnosis: {
title: '智能诊断',
footer: '基于当前指标的自动诊断建议',
idle: '系统当前处于待机状态',
idleImpact: '无活跃流量',
upstreamCritical: '上游错误率严重偏高 ({rate}%)',
upstreamCriticalImpact: '可能影响大量用户请求',
upstreamHigh: '上游错误率偏高 ({rate}%)',
upstreamHighImpact: '建议检查上游服务状态',
slaCritical: 'SLA 严重低于目标 ({sla}%)',
slaCriticalImpact: '用户体验严重受损',
slaLow: 'SLA 低于目标 ({sla}%)',
slaLowImpact: '需要关注服务质量',
errorHigh: '错误率过高 ({rate}%)',
errorHighImpact: '大量请求失败',
errorElevated: '错误率偏高 ({rate}%)',
errorElevatedImpact: '建议检查错误日志',
healthCritical: '综合健康评分过低 ({score})',
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
healthLow: '综合健康评分偏低 ({score})',
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
healthy: '所有系统指标正常',
healthyImpact: '服务运行稳定'
},
// Error Log // Error Log
errorLog: { errorLog: {
timeId: '时间 / ID', timeId: '时间 / ID',
@@ -2212,9 +2252,23 @@ export default {
}, },
tooltips: { tooltips: {
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
latencyHistogram: '成功请求的延迟分布(duration_ms)。', latencyHistogram: '成功请求的延迟分布(毫秒)。',
errorTrend: '错误趋势SLA 口径排除业务限制;上游错误率排除 429/529。', errorTrend: '错误趋势SLA 口径排除业务限制;上游错误率排除 429/529。',
errorDistribution: '按状态码统计的错误分布。' errorDistribution: '按状态码统计的错误分布。',
goroutines:
'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
cpu: 'CPU 使用率,显示系统处理器的负载情况。',
memory: '内存使用率,包括已使用和总可用内存。',
db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
redis: 'Redis 连接池状态,显示活跃和空闲的连接数。',
jobs: '后台任务执行状态,包括最近运行时间、成功时间和错误信息。',
qps: '每秒查询数QPS和每秒Token数TPS实时显示系统吞吐量。',
tokens: '当前时间窗口内处理的总Token数量。',
sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
errors: '错误统计,包括总错误数、错误率和上游错误率。',
latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。',
ttft: '首Token延迟Time To First Token衡量流式响应的首字节返回速度。',
health: '系统健康评分0-100综合考虑 SLA、错误率和资源使用情况。'
}, },
charts: { charts: {
emptyRequest: '该时间窗口内暂无请求。', emptyRequest: '该时间窗口内暂无请求。',
@@ -2320,14 +2374,16 @@ export default {
description: '启用运维监控模块,用于排障与健康可视化', description: '启用运维监控模块,用于排障与健康可视化',
disabled: '运维监控已关闭', disabled: '运维监控已关闭',
enabled: '启用运维监控', enabled: '启用运维监控',
enabledHint: '启用 Ops 运维监控模块(仅管理员可见)', enabledHint: '启用运维监控模块(仅管理员可见)',
realtimeEnabled: '启用实时监控', realtimeEnabled: '启用实时监控',
realtimeEnabledHint: '启用实时 QPS/指标推送WebSocket', realtimeEnabledHint: '启用实时请求速率和指标推送WebSocket',
queryMode: '默认查询模式', queryMode: '默认查询模式',
queryModeHint: 'Ops Dashboard 默认查询模式auto/raw/preagg', queryModeHint: '运维监控默认查询模式(自动/原始/预聚合',
queryModeAuto: '自动(推荐)', queryModeAuto: '自动(推荐)',
queryModeRaw: 'Raw(最准,但较慢)', queryModeRaw: '原始(最准,但较慢)',
queryModePreagg: 'Preagg(最快,需预聚合)' queryModePreagg: '预聚合(最快,需预聚合)',
metricsInterval: '采集频率(秒)',
metricsIntervalHint: '系统/请求指标采集频率60-3600 秒)'
}, },
adminApiKey: { adminApiKey: {
title: '管理员 API Key', title: '管理员 API Key',

View File

@@ -715,6 +715,25 @@
class="w-[220px]" class="w-[220px]"
/> />
</div> </div>
<div v-if="form.ops_monitoring_enabled" class="mt-5 flex items-center justify-between">
<div>
<label class="font-medium text-gray-900 dark:text-white">{{
t('admin.settings.opsMonitoring.metricsInterval')
}}</label>
<p class="text-sm text-gray-500 dark:text-gray-400">
{{ t('admin.settings.opsMonitoring.metricsIntervalHint') }}
</p>
</div>
<input
v-model.number="form.ops_metrics_interval_seconds"
type="number"
min="60"
max="3600"
step="10"
class="w-[220px] rounded-lg border border-gray-300 bg-white px-3 py-2 text-sm text-gray-900 shadow-sm focus:border-primary-500 focus:outline-none focus:ring-1 focus:ring-primary-500 dark:border-dark-600 dark:bg-dark-800 dark:text-white"
/>
</div>
</div> </div>
</div> </div>
@@ -824,7 +843,8 @@ const form = reactive<SettingsForm>({
// Ops Monitoring (vNext) // Ops Monitoring (vNext)
ops_monitoring_enabled: true, ops_monitoring_enabled: true,
ops_realtime_monitoring_enabled: true, ops_realtime_monitoring_enabled: true,
ops_query_mode_default: 'auto' ops_query_mode_default: 'auto',
ops_metrics_interval_seconds: 60
}) })
const opsQueryModeOptions = computed(() => [ const opsQueryModeOptions = computed(() => [
@@ -922,7 +942,8 @@ async function saveSettings() {
identity_patch_prompt: form.identity_patch_prompt, identity_patch_prompt: form.identity_patch_prompt,
ops_monitoring_enabled: form.ops_monitoring_enabled, ops_monitoring_enabled: form.ops_monitoring_enabled,
ops_realtime_monitoring_enabled: form.ops_realtime_monitoring_enabled, ops_realtime_monitoring_enabled: form.ops_realtime_monitoring_enabled,
ops_query_mode_default: form.ops_query_mode_default ops_query_mode_default: form.ops_query_mode_default,
ops_metrics_interval_seconds: form.ops_metrics_interval_seconds
} }
const updated = await adminAPI.settings.updateSettings(payload) const updated = await adminAPI.settings.updateSettings(payload)
Object.assign(form, updated) Object.assign(form, updated)

View File

@@ -33,190 +33,6 @@
@open-error-details="openErrorDetails" @open-error-details="openErrorDetails"
/> />
<!-- Overview -->
<div
v-if="opsEnabled && !(loading && !hasLoadedOnce)"
class="overflow-hidden rounded-3xl bg-white shadow-sm ring-1 ring-gray-900/5 dark:bg-dark-800 dark:ring-dark-700"
>
<div class="border-b border-gray-100 px-6 py-4 dark:border-dark-700">
<h3 class="text-base font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.systemHealth') }}</h3>
</div>
<div class="p-6">
<div v-if="loadingOverview" class="flex items-center justify-center py-10">
<div class="h-8 w-8 animate-spin rounded-full border-b-2 border-primary-600"></div>
</div>
<div v-else-if="!overview?.system_metrics" class="py-6 text-sm text-gray-500 dark:text-gray-400">
{{ t('admin.ops.noSystemMetrics') }}
</div>
<div v-else class="space-y-6">
<div class="text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.collectedAt') }} {{ formatDateTime(overview.system_metrics.created_at) }} ({{ t('admin.ops.window') }}
{{ overview.system_metrics.window_minutes }}m)
</div>
<div class="grid grid-cols-1 gap-4 md:grid-cols-5">
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.cpu') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatPercent0to100(overview.system_metrics.cpu_usage_percent) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.memory') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatPercent0to100(overview.system_metrics.memory_usage_percent) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ formatMBPair(overview.system_metrics.memory_used_mb, overview.system_metrics.memory_total_mb) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.db') }}</div>
<div class="mt-1 text-xl font-semibold" :class="boolOkClass(overview.system_metrics.db_ok)">
{{ boolOkLabel(overview.system_metrics.db_ok) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.active') }}: {{ overview.system_metrics.db_conn_active ?? '-' }}, {{ t('admin.ops.idle') }}:
{{ overview.system_metrics.db_conn_idle ?? '-' }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.redis') }}</div>
<div class="mt-1 text-xl font-semibold" :class="boolOkClass(overview.system_metrics.redis_ok)">
{{ boolOkLabel(overview.system_metrics.redis_ok) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.goroutines') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ overview.system_metrics.goroutine_count ?? '-' }}
</div>
</div>
</div>
<div v-if="overview?.job_heartbeats?.length" class="rounded-xl border border-gray-100 dark:border-dark-700">
<div class="border-b border-gray-100 px-4 py-3 text-sm font-semibold text-gray-900 dark:border-dark-700 dark:text-white">
{{ t('admin.ops.jobs') }}
</div>
<div class="divide-y divide-gray-100 dark:divide-dark-700">
<div
v-for="job in overview.job_heartbeats"
:key="job.job_name"
class="flex flex-col gap-1 px-4 py-3 md:flex-row md:items-center md:justify-between"
>
<div class="text-sm font-medium text-gray-900 dark:text-white">
{{ job.job_name }}
</div>
<div class="text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.lastRun') }}: {{ job.last_run_at ? formatDateTime(job.last_run_at) : '-' }} · {{ t('admin.ops.lastSuccess') }}:
{{ job.last_success_at ? formatDateTime(job.last_success_at) : '-' }} ·
<span v-if="job.last_error" class="text-rose-600 dark:text-rose-400">
{{ t('admin.ops.lastError') }}: {{ job.last_error }}
</span>
<span v-else class="text-emerald-600 dark:text-emerald-400">{{ t('admin.ops.ok') }}</span>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="card">
<div class="border-b border-gray-100 px-6 py-4 dark:border-dark-700">
<h3 class="text-base font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.overview') }}</h3>
</div>
<div class="p-6">
<div v-if="loadingOverview" class="flex items-center justify-center py-10">
<div class="h-8 w-8 animate-spin rounded-full border-b-2 border-primary-600"></div>
</div>
<div v-else-if="!overview" class="py-6 text-sm text-gray-500 dark:text-gray-400">
{{ t('admin.ops.noData') }}
</div>
<div v-else class="space-y-6">
<div class="grid grid-cols-1 gap-4 md:grid-cols-4">
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.requestsTotal') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatInt(overview.request_count_total) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.slaScope') }} {{ formatInt(overview.request_count_sla) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.tokens') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatInt(overview.token_consumed) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.tps') }} {{ overview.tps.current }} ({{ t('admin.ops.peak') }} {{ overview.tps.peak }})
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.sla') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatPercent(overview.sla) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.businessLimited') }}: {{ formatInt(overview.business_limited_count) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.errors') }}</div>
<div class="mt-1 text-xs text-gray-600 dark:text-gray-300">
{{ t('admin.ops.errorRate') }}: <span class="font-semibold">{{ formatPercent(overview.error_rate) }}</span>
</div>
<div class="mt-1 text-xs text-gray-600 dark:text-gray-300">
{{ t('admin.ops.upstreamRate') }}: <span class="font-semibold">{{ formatPercent(overview.upstream_error_rate) }}</span>
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
429: {{ formatInt(overview.upstream_429_count) }} · 529:
{{ formatInt(overview.upstream_529_count) }}
</div>
</div>
</div>
<div class="grid grid-cols-1 gap-4 md:grid-cols-2">
<div class="rounded-xl border border-gray-200 bg-white p-4 dark:border-dark-700 dark:bg-dark-900">
<div class="text-sm font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.latencyDuration') }}</div>
<div class="mt-3 grid grid-cols-2 gap-2 text-xs text-gray-600 dark:text-gray-300 md:grid-cols-3">
<div>{{ t('admin.ops.p50') }}: <span class="font-mono">{{ formatMs(overview.duration.p50_ms) }}</span></div>
<div>{{ t('admin.ops.p90') }}: <span class="font-mono">{{ formatMs(overview.duration.p90_ms) }}</span></div>
<div>{{ t('admin.ops.p95') }}: <span class="font-mono">{{ formatMs(overview.duration.p95_ms) }}</span></div>
<div>{{ t('admin.ops.p99') }}: <span class="font-mono">{{ formatMs(overview.duration.p99_ms) }}</span></div>
<div>{{ t('admin.ops.avg') }}: <span class="font-mono">{{ formatMs(overview.duration.avg_ms) }}</span></div>
<div>{{ t('admin.ops.max') }}: <span class="font-mono">{{ formatMs(overview.duration.max_ms) }}</span></div>
</div>
</div>
<div class="rounded-xl border border-gray-200 bg-white p-4 dark:border-dark-700 dark:bg-dark-900">
<div class="text-sm font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.ttftLabel') }}</div>
<div class="mt-3 grid grid-cols-2 gap-2 text-xs text-gray-600 dark:text-gray-300 md:grid-cols-3">
<div>{{ t('admin.ops.p50') }}: <span class="font-mono">{{ formatMs(overview.ttft.p50_ms) }}</span></div>
<div>{{ t('admin.ops.p90') }}: <span class="font-mono">{{ formatMs(overview.ttft.p90_ms) }}</span></div>
<div>{{ t('admin.ops.p95') }}: <span class="font-mono">{{ formatMs(overview.ttft.p95_ms) }}</span></div>
<div>{{ t('admin.ops.p99') }}: <span class="font-mono">{{ formatMs(overview.ttft.p99_ms) }}</span></div>
<div>{{ t('admin.ops.avg') }}: <span class="font-mono">{{ formatMs(overview.ttft.avg_ms) }}</span></div>
<div>{{ t('admin.ops.max') }}: <span class="font-mono">{{ formatMs(overview.ttft.max_ms) }}</span></div>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Row: Concurrency + Throughput --> <!-- Row: Concurrency + Throughput -->
<div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="grid grid-cols-1 gap-6 lg:grid-cols-3"> <div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="grid grid-cols-1 gap-6 lg:grid-cols-3">
<div class="lg:col-span-1 min-h-[360px]"> <div class="lg:col-span-1 min-h-[360px]">
@@ -308,7 +124,6 @@ import OpsLatencyChart from './components/OpsLatencyChart.vue'
import OpsThroughputTrendChart from './components/OpsThroughputTrendChart.vue' import OpsThroughputTrendChart from './components/OpsThroughputTrendChart.vue'
import OpsAlertEventsCard from './components/OpsAlertEventsCard.vue' import OpsAlertEventsCard from './components/OpsAlertEventsCard.vue'
import OpsRequestDetailsModal, { type OpsRequestDetailsPreset } from './components/OpsRequestDetailsModal.vue' import OpsRequestDetailsModal, { type OpsRequestDetailsPreset } from './components/OpsRequestDetailsModal.vue'
import { formatDateTime, formatNumberLocaleString } from '@/utils/format'
const route = useRoute() const route = useRoute()
const router = useRouter() const router = useRouter()
@@ -486,7 +301,6 @@ const syncQueryToRoute = useDebounceFn(async () => {
}, 250) }, 250)
const overview = ref<OpsDashboardOverview | null>(null) const overview = ref<OpsDashboardOverview | null>(null)
const loadingOverview = ref(false)
const throughputTrend = ref<OpsThroughputTrendResponse | null>(null) const throughputTrend = ref<OpsThroughputTrendResponse | null>(null)
const loadingTrend = ref(false) const loadingTrend = ref(false)
@@ -523,12 +337,15 @@ function handleThroughputSelectGroup(nextGroupId: number) {
groupId.value = id groupId.value = id
} }
function handleOpenRequestDetails() { function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) {
requestDetailsPreset.value = { const basePreset: OpsRequestDetailsPreset = {
title: t('admin.ops.requestDetails.title'), title: t('admin.ops.requestDetails.title'),
kind: 'all', kind: 'all',
sort: 'created_at_desc' sort: 'created_at_desc'
} }
requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) }
if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title
showRequestDetails.value = true showRequestDetails.value = true
} }
@@ -573,46 +390,8 @@ function openError(id: number) {
showErrorModal.value = true showErrorModal.value = true
} }
function formatInt(v: number | null | undefined): string {
if (typeof v !== 'number') return '0'
return formatNumberLocaleString(v)
}
function formatPercent(v: number | null | undefined): string {
if (typeof v !== 'number') return '-'
return `${(v * 100).toFixed(2)}%`
}
function formatPercent0to100(v: number | null | undefined): string {
if (typeof v !== 'number') return '-'
return `${v.toFixed(1)}%`
}
function formatMBPair(used: number | null | undefined, total: number | null | undefined): string {
if (typeof used !== 'number' || typeof total !== 'number') return '-'
return `${formatNumberLocaleString(used)} / ${formatNumberLocaleString(total)} MB`
}
function boolOkLabel(v: boolean | null | undefined): string {
if (v === true) return 'OK'
if (v === false) return 'FAIL'
return '-'
}
function boolOkClass(v: boolean | null | undefined): string {
if (v === true) return 'text-emerald-600 dark:text-emerald-400'
if (v === false) return 'text-rose-600 dark:text-rose-400'
return 'text-gray-900 dark:text-white'
}
function formatMs(v: number | null | undefined): string {
if (v == null) return '-'
return `${v}ms`
}
async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) { async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) {
if (!opsEnabled.value) return if (!opsEnabled.value) return
loadingOverview.value = true
try { try {
const data = await opsAPI.getDashboardOverview( const data = await opsAPI.getDashboardOverview(
{ {
@@ -628,11 +407,7 @@ async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal)
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
overview.value = null overview.value = null
appStore.showError(err?.message || 'Failed to load overview') appStore.showError(err?.message || t('admin.ops.failedToLoadOverview'))
} finally {
if (fetchSeq === dashboardFetchSeq) {
loadingOverview.value = false
}
} }
} }
@@ -654,7 +429,7 @@ async function refreshThroughputTrendWithCancel(fetchSeq: number, signal: AbortS
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
throughputTrend.value = null throughputTrend.value = null
appStore.showError(err?.message || 'Failed to load throughput trend') appStore.showError(err?.message || t('admin.ops.failedToLoadThroughputTrend'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingTrend.value = false loadingTrend.value = false
@@ -680,7 +455,7 @@ async function refreshLatencyHistogramWithCancel(fetchSeq: number, signal: Abort
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
latencyHistogram.value = null latencyHistogram.value = null
appStore.showError(err?.message || 'Failed to load latency histogram') appStore.showError(err?.message || t('admin.ops.failedToLoadLatencyHistogram'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingLatency.value = false loadingLatency.value = false
@@ -706,7 +481,7 @@ async function refreshErrorTrendWithCancel(fetchSeq: number, signal: AbortSignal
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
errorTrend.value = null errorTrend.value = null
appStore.showError(err?.message || 'Failed to load error trend') appStore.showError(err?.message || t('admin.ops.failedToLoadErrorTrend'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingErrorTrend.value = false loadingErrorTrend.value = false
@@ -732,7 +507,7 @@ async function refreshErrorDistributionWithCancel(fetchSeq: number, signal: Abor
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
errorDistribution.value = null errorDistribution.value = null
appStore.showError(err?.message || 'Failed to load error distribution') appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDistribution'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingErrorDistribution.value = false loadingErrorDistribution.value = false

View File

@@ -286,7 +286,7 @@ async function fetchDetail(id: number) {
} }
} catch (err: any) { } catch (err: any) {
detail.value = null detail.value = null
appStore.showError(err?.message || 'Failed to load error detail') appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDetail'))
} finally { } finally {
loading.value = false loading.value = false
} }
@@ -348,7 +348,7 @@ async function runConfirmedRetry() {
const summary = res.status === 'succeeded' ? t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed') const summary = res.status === 'succeeded' ? t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed')
appStore.showSuccess(summary) appStore.showSuccess(summary)
} catch (err: any) { } catch (err: any) {
appStore.showError(err?.message || 'Retry failed') appStore.showError(err?.message || t('admin.ops.retryFailed'))
} finally { } finally {
retrying.value = false retrying.value = false
} }